tclRegexp.cGo to the documentation of this file.00001 /* 00002 * tclRegexp.c -- 00003 * 00004 * This file contains the public interfaces to the Tcl regular expression 00005 * mechanism. 00006 * 00007 * Copyright (c) 1998 by Sun Microsystems, Inc. 00008 * Copyright (c) 1998-1999 by Scriptics Corporation. 00009 * 00010 * See the file "license.terms" for information on usage and redistribution of 00011 * this file, and for a DISCLAIMER OF ALL WARRANTIES. 00012 * 00013 * RCS: @(#) $Id: tclRegexp.c,v 1.28 2007/12/13 15:23:20 dgp Exp $ 00014 */ 00015 00016 #include "tclInt.h" 00017 #include "tclRegexp.h" 00018 00019 /* 00020 *---------------------------------------------------------------------- 00021 * The routines in this file use Henry Spencer's regular expression package 00022 * contained in the following additional source files: 00023 * 00024 * regc_color.c regc_cvec.c regc_lex.c 00025 * regc_nfa.c regcomp.c regcustom.h 00026 * rege_dfa.c regerror.c regerrs.h 00027 * regex.h regexec.c regfree.c 00028 * regfronts.c regguts.h 00029 * 00030 * Copyright (c) 1998 Henry Spencer. All rights reserved. 00031 * 00032 * Development of this software was funded, in part, by Cray Research Inc., 00033 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 00034 * Corporation, none of whom are responsible for the results. The author 00035 * thanks all of them. 00036 * 00037 * Redistribution and use in source and binary forms -- with or without 00038 * modification -- are permitted for any purpose, provided that 00039 * redistributions in source form retain this entire copyright notice and 00040 * indicate the origin and nature of any modifications. 00041 * 00042 * I'd appreciate being given credit for this package in the documentation of 00043 * software which uses it, but that is not a requirement. 00044 * 00045 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 00046 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 00047 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 00048 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00049 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00050 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 00051 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00052 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 00053 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 00054 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00055 * 00056 * *** NOTE: this code has been altered slightly for use in Tcl: *** 00057 * *** 1. Names have been changed, e.g. from re_comp to *** 00058 * *** TclRegComp, to avoid clashes with other *** 00059 * *** regexp implementations used by applications. *** 00060 */ 00061 00062 /* 00063 * Thread local storage used to maintain a per-thread cache of compiled 00064 * regular expressions. 00065 */ 00066 00067 #define NUM_REGEXPS 30 00068 00069 typedef struct ThreadSpecificData { 00070 int initialized; /* Set to 1 when the module is initialized. */ 00071 char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled regular 00072 * expression patterns. NULL means that this 00073 * slot isn't used. Malloc-ed. */ 00074 int patLengths[NUM_REGEXPS];/* Number of non-null characters in 00075 * corresponding entry in patterns. -1 means 00076 * entry isn't used. */ 00077 struct TclRegexp *regexps[NUM_REGEXPS]; 00078 /* Compiled forms of above strings. Also 00079 * malloc-ed, or NULL if not in use yet. */ 00080 } ThreadSpecificData; 00081 00082 static Tcl_ThreadDataKey dataKey; 00083 00084 /* 00085 * Declarations for functions used only in this file. 00086 */ 00087 00088 static TclRegexp * CompileRegexp(Tcl_Interp *interp, const char *pattern, 00089 int length, int flags); 00090 static void DupRegexpInternalRep(Tcl_Obj *srcPtr, 00091 Tcl_Obj *copyPtr); 00092 static void FinalizeRegexp(ClientData clientData); 00093 static void FreeRegexp(TclRegexp *regexpPtr); 00094 static void FreeRegexpInternalRep(Tcl_Obj *objPtr); 00095 static int RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re, 00096 const Tcl_UniChar *uniString, int numChars, 00097 int nmatches, int flags); 00098 static int SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); 00099 00100 /* 00101 * The regular expression Tcl object type. This serves as a cache of the 00102 * compiled form of the regular expression. 00103 */ 00104 00105 Tcl_ObjType tclRegexpType = { 00106 "regexp", /* name */ 00107 FreeRegexpInternalRep, /* freeIntRepProc */ 00108 DupRegexpInternalRep, /* dupIntRepProc */ 00109 NULL, /* updateStringProc */ 00110 SetRegexpFromAny /* setFromAnyProc */ 00111 }; 00112 00113 /* 00114 *---------------------------------------------------------------------- 00115 * 00116 * Tcl_RegExpCompile -- 00117 * 00118 * Compile a regular expression into a form suitable for fast matching. 00119 * This function is DEPRECATED in favor of the object version of the 00120 * command. 00121 * 00122 * Results: 00123 * The return value is a pointer to the compiled form of string, suitable 00124 * for passing to Tcl_RegExpExec. This compiled form is only valid up 00125 * until the next call to this function, so don't keep these around for a 00126 * long time! If an error occurred while compiling the pattern, then NULL 00127 * is returned and an error message is left in the interp's result. 00128 * 00129 * Side effects: 00130 * Updates the cache of compiled regexps. 00131 * 00132 *---------------------------------------------------------------------- 00133 */ 00134 00135 Tcl_RegExp 00136 Tcl_RegExpCompile( 00137 Tcl_Interp *interp, /* For use in error reporting and to access 00138 * the interp regexp cache. */ 00139 const char *pattern) /* String for which to produce compiled 00140 * regular expression. */ 00141 { 00142 return (Tcl_RegExp) CompileRegexp(interp, pattern, (int) strlen(pattern), 00143 REG_ADVANCED); 00144 } 00145 00146 /* 00147 *---------------------------------------------------------------------- 00148 * 00149 * Tcl_RegExpExec -- 00150 * 00151 * Execute the regular expression matcher using a compiled form of a 00152 * regular expression and save information about any match that is found. 00153 * 00154 * Results: 00155 * If an error occurs during the matching operation then -1 is returned 00156 * and the interp's result contains an error message. Otherwise the 00157 * return value is 1 if a matching range is found and 0 if there is no 00158 * matching range. 00159 * 00160 * Side effects: 00161 * None. 00162 * 00163 *---------------------------------------------------------------------- 00164 */ 00165 00166 int 00167 Tcl_RegExpExec( 00168 Tcl_Interp *interp, /* Interpreter to use for error reporting. */ 00169 Tcl_RegExp re, /* Compiled regular expression; must have been 00170 * returned by previous call to 00171 * Tcl_GetRegExpFromObj. */ 00172 const char *text, /* Text against which to match re. */ 00173 const char *start) /* If text is part of a larger string, this 00174 * identifies beginning of larger string, so 00175 * that "^" won't match. */ 00176 { 00177 int flags, result, numChars; 00178 TclRegexp *regexp = (TclRegexp *)re; 00179 Tcl_DString ds; 00180 const Tcl_UniChar *ustr; 00181 00182 /* 00183 * If the starting point is offset from the beginning of the buffer, then 00184 * we need to tell the regexp engine not to match "^". 00185 */ 00186 00187 if (text > start) { 00188 flags = REG_NOTBOL; 00189 } else { 00190 flags = 0; 00191 } 00192 00193 /* 00194 * Remember the string for use by Tcl_RegExpRange(). 00195 */ 00196 00197 regexp->string = text; 00198 regexp->objPtr = NULL; 00199 00200 /* 00201 * Convert the string to Unicode and perform the match. 00202 */ 00203 00204 Tcl_DStringInit(&ds); 00205 ustr = Tcl_UtfToUniCharDString(text, -1, &ds); 00206 numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar); 00207 result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */, 00208 flags); 00209 Tcl_DStringFree(&ds); 00210 00211 return result; 00212 } 00213 00214 /* 00215 *--------------------------------------------------------------------------- 00216 * 00217 * Tcl_RegExpRange -- 00218 * 00219 * Returns pointers describing the range of a regular expression match, 00220 * or one of the subranges within the match. 00221 * 00222 * Results: 00223 * The variables at *startPtr and *endPtr are modified to hold the 00224 * addresses of the endpoints of the range given by index. If the 00225 * specified range doesn't exist then NULLs are returned. 00226 * 00227 * Side effects: 00228 * None. 00229 * 00230 *--------------------------------------------------------------------------- 00231 */ 00232 00233 void 00234 Tcl_RegExpRange( 00235 Tcl_RegExp re, /* Compiled regular expression that has been 00236 * passed to Tcl_RegExpExec. */ 00237 int index, /* 0 means give the range of the entire match, 00238 * > 0 means give the range of a matching 00239 * subrange. */ 00240 const char **startPtr, /* Store address of first character in 00241 * (sub-)range here. */ 00242 const char **endPtr) /* Store address of character just after last 00243 * in (sub-)range here. */ 00244 { 00245 TclRegexp *regexpPtr = (TclRegexp *) re; 00246 const char *string; 00247 00248 if ((size_t) index > regexpPtr->re.re_nsub) { 00249 *startPtr = *endPtr = NULL; 00250 } else if (regexpPtr->matches[index].rm_so < 0) { 00251 *startPtr = *endPtr = NULL; 00252 } else { 00253 if (regexpPtr->objPtr) { 00254 string = TclGetString(regexpPtr->objPtr); 00255 } else { 00256 string = regexpPtr->string; 00257 } 00258 *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so); 00259 *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo); 00260 } 00261 } 00262 00263 /* 00264 *--------------------------------------------------------------------------- 00265 * 00266 * RegExpExecUniChar -- 00267 * 00268 * Execute the regular expression matcher using a compiled form of a 00269 * regular expression and save information about any match that is found. 00270 * 00271 * Results: 00272 * If an error occurs during the matching operation then -1 is returned 00273 * and an error message is left in interp's result. Otherwise the return 00274 * value is 1 if a matching range was found or 0 if there was no matching 00275 * range. 00276 * 00277 * Side effects: 00278 * None. 00279 * 00280 *---------------------------------------------------------------------- 00281 */ 00282 00283 static int 00284 RegExpExecUniChar( 00285 Tcl_Interp *interp, /* Interpreter to use for error reporting. */ 00286 Tcl_RegExp re, /* Compiled regular expression; returned by a 00287 * previous call to Tcl_GetRegExpFromObj */ 00288 const Tcl_UniChar *wString, /* String against which to match re. */ 00289 int numChars, /* Length of Tcl_UniChar string (must be 00290 * >=0). */ 00291 int nmatches, /* How many subexpression matches (counting 00292 * the whole match as subexpression 0) are of 00293 * interest. -1 means "don't know". */ 00294 int flags) /* Regular expression flags. */ 00295 { 00296 int status; 00297 TclRegexp *regexpPtr = (TclRegexp *) re; 00298 size_t last = regexpPtr->re.re_nsub + 1; 00299 size_t nm = last; 00300 00301 if (nmatches >= 0 && (size_t) nmatches < nm) { 00302 nm = (size_t) nmatches; 00303 } 00304 00305 status = TclReExec(®expPtr->re, wString, (size_t) numChars, 00306 ®expPtr->details, nm, regexpPtr->matches, flags); 00307 00308 /* 00309 * Check for errors. 00310 */ 00311 00312 if (status != REG_OKAY) { 00313 if (status == REG_NOMATCH) { 00314 return 0; 00315 } 00316 if (interp != NULL) { 00317 TclRegError(interp, "error while matching regular expression: ", 00318 status); 00319 } 00320 return -1; 00321 } 00322 return 1; 00323 } 00324 00325 /* 00326 *--------------------------------------------------------------------------- 00327 * 00328 * TclRegExpRangeUniChar -- 00329 * 00330 * Returns pointers describing the range of a regular expression match, 00331 * or one of the subranges within the match, or the hypothetical range 00332 * represented by the rm_extend field of the rm_detail_t. 00333 * 00334 * Results: 00335 * The variables at *startPtr and *endPtr are modified to hold the 00336 * offsets of the endpoints of the range given by index. If the specified 00337 * range doesn't exist then -1s are supplied. 00338 * 00339 * Side effects: 00340 * None. 00341 * 00342 *--------------------------------------------------------------------------- 00343 */ 00344 00345 void 00346 TclRegExpRangeUniChar( 00347 Tcl_RegExp re, /* Compiled regular expression that has been 00348 * passed to Tcl_RegExpExec. */ 00349 int index, /* 0 means give the range of the entire match, 00350 * > 0 means give the range of a matching 00351 * subrange, -1 means the range of the 00352 * rm_extend field. */ 00353 int *startPtr, /* Store address of first character in 00354 * (sub-)range here. */ 00355 int *endPtr) /* Store address of character just after last 00356 * in (sub-)range here. */ 00357 { 00358 TclRegexp *regexpPtr = (TclRegexp *) re; 00359 00360 if ((regexpPtr->flags®_EXPECT) && index == -1) { 00361 *startPtr = regexpPtr->details.rm_extend.rm_so; 00362 *endPtr = regexpPtr->details.rm_extend.rm_eo; 00363 } else if ((size_t) index > regexpPtr->re.re_nsub) { 00364 *startPtr = -1; 00365 *endPtr = -1; 00366 } else { 00367 *startPtr = regexpPtr->matches[index].rm_so; 00368 *endPtr = regexpPtr->matches[index].rm_eo; 00369 } 00370 } 00371 00372 /* 00373 *---------------------------------------------------------------------- 00374 * 00375 * Tcl_RegExpMatch -- 00376 * 00377 * See if a string matches a regular expression. 00378 * 00379 * Results: 00380 * If an error occurs during the matching operation then -1 is returned 00381 * and the interp's result contains an error message. Otherwise the 00382 * return value is 1 if "text" matches "pattern" and 0 otherwise. 00383 * 00384 * Side effects: 00385 * None. 00386 * 00387 *---------------------------------------------------------------------- 00388 */ 00389 00390 int 00391 Tcl_RegExpMatch( 00392 Tcl_Interp *interp, /* Used for error reporting. May be NULL. */ 00393 const char *text, /* Text to search for pattern matches. */ 00394 const char *pattern) /* Regular expression to match against text. */ 00395 { 00396 Tcl_RegExp re; 00397 00398 re = Tcl_RegExpCompile(interp, pattern); 00399 if (re == NULL) { 00400 return -1; 00401 } 00402 return Tcl_RegExpExec(interp, re, text, text); 00403 } 00404 00405 /* 00406 *---------------------------------------------------------------------- 00407 * 00408 * Tcl_RegExpExecObj -- 00409 * 00410 * Execute a precompiled regexp against the given object. 00411 * 00412 * Results: 00413 * If an error occurs during the matching operation then -1 is returned 00414 * and the interp's result contains an error message. Otherwise the 00415 * return value is 1 if "string" matches "pattern" and 0 otherwise. 00416 * 00417 * Side effects: 00418 * Converts the object to a Unicode object. 00419 * 00420 *---------------------------------------------------------------------- 00421 */ 00422 00423 int 00424 Tcl_RegExpExecObj( 00425 Tcl_Interp *interp, /* Interpreter to use for error reporting. */ 00426 Tcl_RegExp re, /* Compiled regular expression; must have been 00427 * returned by previous call to 00428 * Tcl_GetRegExpFromObj. */ 00429 Tcl_Obj *textObj, /* Text against which to match re. */ 00430 int offset, /* Character index that marks where matching 00431 * should begin. */ 00432 int nmatches, /* How many subexpression matches (counting 00433 * the whole match as subexpression 0) are of 00434 * interest. -1 means all of them. */ 00435 int flags) /* Regular expression execution flags. */ 00436 { 00437 TclRegexp *regexpPtr = (TclRegexp *) re; 00438 Tcl_UniChar *udata; 00439 int length; 00440 int reflags = regexpPtr->flags; 00441 #define TCL_REG_GLOBOK_FLAGS (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE) 00442 00443 /* 00444 * Take advantage of the equivalent glob pattern, if one exists. 00445 * This is possible based only on the right mix of incoming flags (0) 00446 * and regexp compile flags. 00447 */ 00448 if ((offset == 0) && (nmatches == 0) && (flags == 0) 00449 && !(reflags & ~TCL_REG_GLOBOK_FLAGS) 00450 && (regexpPtr->globObjPtr != NULL)) { 00451 int nocase = (reflags & TCL_REG_NOCASE) ? TCL_MATCH_NOCASE : 0; 00452 00453 /* 00454 * Pass to TclStringMatchObj for obj-specific handling. 00455 * XXX: Currently doesn't take advantage of exact-ness that 00456 * XXX: TclReToGlob tells us about 00457 */ 00458 00459 return TclStringMatchObj(textObj, regexpPtr->globObjPtr, nocase); 00460 } 00461 00462 /* 00463 * Save the target object so we can extract strings from it later. 00464 */ 00465 00466 regexpPtr->string = NULL; 00467 regexpPtr->objPtr = textObj; 00468 00469 udata = Tcl_GetUnicodeFromObj(textObj, &length); 00470 00471 if (offset > length) { 00472 offset = length; 00473 } 00474 udata += offset; 00475 length -= offset; 00476 00477 return RegExpExecUniChar(interp, re, udata, length, nmatches, flags); 00478 } 00479 00480 /* 00481 *---------------------------------------------------------------------- 00482 * 00483 * Tcl_RegExpMatchObj -- 00484 * 00485 * See if an object matches a regular expression. 00486 * 00487 * Results: 00488 * If an error occurs during the matching operation then -1 is returned 00489 * and the interp's result contains an error message. Otherwise the 00490 * return value is 1 if "text" matches "pattern" and 0 otherwise. 00491 * 00492 * Side effects: 00493 * Changes the internal rep of the pattern and string objects. 00494 * 00495 *---------------------------------------------------------------------- 00496 */ 00497 00498 int 00499 Tcl_RegExpMatchObj( 00500 Tcl_Interp *interp, /* Used for error reporting. May be NULL. */ 00501 Tcl_Obj *textObj, /* Object containing the String to search. */ 00502 Tcl_Obj *patternObj) /* Regular expression to match against 00503 * string. */ 00504 { 00505 Tcl_RegExp re; 00506 00507 re = Tcl_GetRegExpFromObj(interp, patternObj, 00508 TCL_REG_ADVANCED | TCL_REG_NOSUB); 00509 if (re == NULL) { 00510 return -1; 00511 } 00512 return Tcl_RegExpExecObj(interp, re, textObj, 0 /* offset */, 00513 0 /* nmatches */, 0 /* flags */); 00514 } 00515 00516 /* 00517 *---------------------------------------------------------------------- 00518 * 00519 * Tcl_RegExpGetInfo -- 00520 * 00521 * Retrieve information about the current match. 00522 * 00523 * Results: 00524 * None. 00525 * 00526 * Side effects: 00527 * None. 00528 * 00529 *---------------------------------------------------------------------- 00530 */ 00531 00532 void 00533 Tcl_RegExpGetInfo( 00534 Tcl_RegExp regexp, /* Pattern from which to get subexpressions. */ 00535 Tcl_RegExpInfo *infoPtr) /* Match information is stored here. */ 00536 { 00537 TclRegexp *regexpPtr = (TclRegexp *) regexp; 00538 00539 infoPtr->nsubs = regexpPtr->re.re_nsub; 00540 infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches; 00541 infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so; 00542 } 00543 00544 /* 00545 *---------------------------------------------------------------------- 00546 * 00547 * Tcl_GetRegExpFromObj -- 00548 * 00549 * Compile a regular expression into a form suitable for fast matching. 00550 * This function caches the result in a Tcl_Obj. 00551 * 00552 * Results: 00553 * The return value is a pointer to the compiled form of string, suitable 00554 * for passing to Tcl_RegExpExec. If an error occurred while compiling 00555 * the pattern, then NULL is returned and an error message is left in the 00556 * interp's result. 00557 * 00558 * Side effects: 00559 * Updates the native rep of the Tcl_Obj. 00560 * 00561 *---------------------------------------------------------------------- 00562 */ 00563 00564 Tcl_RegExp 00565 Tcl_GetRegExpFromObj( 00566 Tcl_Interp *interp, /* For use in error reporting, and to access 00567 * the interp regexp cache. */ 00568 Tcl_Obj *objPtr, /* Object whose string rep contains regular 00569 * expression pattern. Internal rep will be 00570 * changed to compiled form of this regular 00571 * expression. */ 00572 int flags) /* Regular expression compilation flags. */ 00573 { 00574 int length; 00575 TclRegexp *regexpPtr; 00576 char *pattern; 00577 00578 /* 00579 * This is OK because we only actually interpret this value properly as a 00580 * TclRegexp* when the type is tclRegexpType. 00581 */ 00582 00583 regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; 00584 00585 if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) { 00586 pattern = TclGetStringFromObj(objPtr, &length); 00587 00588 regexpPtr = CompileRegexp(interp, pattern, length, flags); 00589 if (regexpPtr == NULL) { 00590 return NULL; 00591 } 00592 00593 /* 00594 * Add a reference to the regexp so it will persist even if it is 00595 * pushed out of the current thread's regexp cache. This reference 00596 * will be removed when the object's internal rep is freed. 00597 */ 00598 00599 regexpPtr->refCount++; 00600 00601 /* 00602 * Free the old representation and set our type. 00603 */ 00604 00605 TclFreeIntRep(objPtr); 00606 objPtr->internalRep.otherValuePtr = (void *) regexpPtr; 00607 objPtr->typePtr = &tclRegexpType; 00608 } 00609 return (Tcl_RegExp) regexpPtr; 00610 } 00611 00612 /* 00613 *---------------------------------------------------------------------- 00614 * 00615 * TclRegAbout -- 00616 * 00617 * Return information about a compiled regular expression. 00618 * 00619 * Results: 00620 * The return value is -1 for failure, 0 for success, although at the 00621 * moment there's nothing that could fail. On success, a list is left in 00622 * the interp's result: first element is the subexpression count, second 00623 * is a list of re_info bit names. 00624 * 00625 * Side effects: 00626 * None. 00627 * 00628 *---------------------------------------------------------------------- 00629 */ 00630 00631 int 00632 TclRegAbout( 00633 Tcl_Interp *interp, /* For use in variable assignment. */ 00634 Tcl_RegExp re) /* The compiled regular expression. */ 00635 { 00636 TclRegexp *regexpPtr = (TclRegexp *) re; 00637 struct infoname { 00638 int bit; 00639 const char *text; 00640 }; 00641 static const struct infoname infonames[] = { 00642 {REG_UBACKREF, "REG_UBACKREF"}, 00643 {REG_ULOOKAHEAD, "REG_ULOOKAHEAD"}, 00644 {REG_UBOUNDS, "REG_UBOUNDS"}, 00645 {REG_UBRACES, "REG_UBRACES"}, 00646 {REG_UBSALNUM, "REG_UBSALNUM"}, 00647 {REG_UPBOTCH, "REG_UPBOTCH"}, 00648 {REG_UBBS, "REG_UBBS"}, 00649 {REG_UNONPOSIX, "REG_UNONPOSIX"}, 00650 {REG_UUNSPEC, "REG_UUNSPEC"}, 00651 {REG_UUNPORT, "REG_UUNPORT"}, 00652 {REG_ULOCALE, "REG_ULOCALE"}, 00653 {REG_UEMPTYMATCH, "REG_UEMPTYMATCH"}, 00654 {REG_UIMPOSSIBLE, "REG_UIMPOSSIBLE"}, 00655 {REG_USHORTEST, "REG_USHORTEST"}, 00656 {0, NULL} 00657 }; 00658 const struct infoname *inf; 00659 Tcl_Obj *infoObj; 00660 00661 /* 00662 * The reset here guarantees that the interpreter result is empty and 00663 * unshared. This means that we can use Tcl_ListObjAppendElement on the 00664 * result object quite safely. 00665 */ 00666 00667 Tcl_ResetResult(interp); 00668 00669 /* 00670 * Assume that there will never be more than INT_MAX subexpressions. This 00671 * is a pretty reasonable assumption; the RE engine doesn't scale _that_ 00672 * well and Tcl has other limits that constrain things as well... 00673 */ 00674 00675 Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp), 00676 Tcl_NewIntObj((int) regexpPtr->re.re_nsub)); 00677 00678 /* 00679 * Now append a list of all the bit-flags set for the RE. 00680 */ 00681 00682 TclNewObj(infoObj); 00683 for (inf=infonames ; inf->bit != 0 ; inf++) { 00684 if (regexpPtr->re.re_info & inf->bit) { 00685 Tcl_ListObjAppendElement(NULL, infoObj, 00686 Tcl_NewStringObj(inf->text, -1)); 00687 } 00688 } 00689 Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp), infoObj); 00690 00691 return 0; 00692 } 00693 00694 /* 00695 *---------------------------------------------------------------------- 00696 * 00697 * TclRegError -- 00698 * 00699 * Generate an error message based on the regexp status code. 00700 * 00701 * Results: 00702 * Places an error in the interpreter. 00703 * 00704 * Side effects: 00705 * Sets errorCode as well. 00706 * 00707 *---------------------------------------------------------------------- 00708 */ 00709 00710 void 00711 TclRegError( 00712 Tcl_Interp *interp, /* Interpreter for error reporting. */ 00713 const char *msg, /* Message to prepend to error. */ 00714 int status) /* Status code to report. */ 00715 { 00716 char buf[100]; /* ample in practice */ 00717 char cbuf[100]; /* lots in practice */ 00718 size_t n; 00719 const char *p; 00720 00721 Tcl_ResetResult(interp); 00722 n = TclReError(status, NULL, buf, sizeof(buf)); 00723 p = (n > sizeof(buf)) ? "..." : ""; 00724 Tcl_AppendResult(interp, msg, buf, p, NULL); 00725 00726 sprintf(cbuf, "%d", status); 00727 (void) TclReError(REG_ITOA, NULL, cbuf, sizeof(cbuf)); 00728 Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL); 00729 } 00730 00731 /* 00732 *---------------------------------------------------------------------- 00733 * 00734 * FreeRegexpInternalRep -- 00735 * 00736 * Deallocate the storage associated with a regexp object's internal 00737 * representation. 00738 * 00739 * Results: 00740 * None. 00741 * 00742 * Side effects: 00743 * Frees the compiled regular expression. 00744 * 00745 *---------------------------------------------------------------------- 00746 */ 00747 00748 static void 00749 FreeRegexpInternalRep( 00750 Tcl_Obj *objPtr) /* Regexp object with internal rep to free. */ 00751 { 00752 TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr; 00753 00754 /* 00755 * If this is the last reference to the regexp, free it. 00756 */ 00757 00758 if (--(regexpRepPtr->refCount) <= 0) { 00759 FreeRegexp(regexpRepPtr); 00760 } 00761 } 00762 00763 /* 00764 *---------------------------------------------------------------------- 00765 * 00766 * DupRegexpInternalRep -- 00767 * 00768 * We copy the reference to the compiled regexp and bump its reference 00769 * count. 00770 * 00771 * Results: 00772 * None. 00773 * 00774 * Side effects: 00775 * Increments the reference count of the regexp. 00776 * 00777 *---------------------------------------------------------------------- 00778 */ 00779 00780 static void 00781 DupRegexpInternalRep( 00782 Tcl_Obj *srcPtr, /* Object with internal rep to copy. */ 00783 Tcl_Obj *copyPtr) /* Object with internal rep to set. */ 00784 { 00785 TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr; 00786 00787 regexpPtr->refCount++; 00788 copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr; 00789 copyPtr->typePtr = &tclRegexpType; 00790 } 00791 00792 /* 00793 *---------------------------------------------------------------------- 00794 * 00795 * SetRegexpFromAny -- 00796 * 00797 * Attempt to generate a compiled regular expression for the Tcl object 00798 * "objPtr". 00799 * 00800 * Results: 00801 * The return value is TCL_OK or TCL_ERROR. If an error occurs during 00802 * conversion, an error message is left in the interpreter's result 00803 * unless "interp" is NULL. 00804 * 00805 * Side effects: 00806 * If no error occurs, a regular expression is stored as "objPtr"s 00807 * internal representation. 00808 * 00809 *---------------------------------------------------------------------- 00810 */ 00811 00812 static int 00813 SetRegexpFromAny( 00814 Tcl_Interp *interp, /* Used for error reporting if not NULL. */ 00815 Tcl_Obj *objPtr) /* The object to convert. */ 00816 { 00817 if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) { 00818 return TCL_ERROR; 00819 } 00820 return TCL_OK; 00821 } 00822 00823 /* 00824 *--------------------------------------------------------------------------- 00825 * 00826 * CompileRegexp -- 00827 * 00828 * Attempt to compile the given regexp pattern. If the compiled regular 00829 * expression can be found in the per-thread cache, it will be used 00830 * instead of compiling a new copy. 00831 * 00832 * Results: 00833 * The return value is a pointer to a newly allocated TclRegexp that 00834 * represents the compiled pattern, or NULL if the pattern could not be 00835 * compiled. If NULL is returned, an error message is left in the 00836 * interp's result. 00837 * 00838 * Side effects: 00839 * The thread-local regexp cache is updated and a new TclRegexp may be 00840 * allocated. 00841 * 00842 *---------------------------------------------------------------------- 00843 */ 00844 00845 static TclRegexp * 00846 CompileRegexp( 00847 Tcl_Interp *interp, /* Used for error reporting if not NULL. */ 00848 const char *string, /* The regexp to compile (UTF-8). */ 00849 int length, /* The length of the string in bytes. */ 00850 int flags) /* Compilation flags. */ 00851 { 00852 TclRegexp *regexpPtr; 00853 const Tcl_UniChar *uniString; 00854 int numChars, status, i, exact; 00855 Tcl_DString stringBuf; 00856 ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); 00857 00858 if (!tsdPtr->initialized) { 00859 tsdPtr->initialized = 1; 00860 Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL); 00861 } 00862 00863 /* 00864 * This routine maintains a second-level regular expression cache in 00865 * addition to the per-object regexp cache. The per-thread cache is needed 00866 * to handle the case where for various reasons the object is lost between 00867 * invocations of the regexp command, but the literal pattern is the same. 00868 */ 00869 00870 /* 00871 * Check the per-thread compiled regexp cache. We can only reuse a regexp 00872 * if it has the same pattern and the same flags. 00873 */ 00874 00875 for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { 00876 if ((length == tsdPtr->patLengths[i]) 00877 && (tsdPtr->regexps[i]->flags == flags) 00878 && (strcmp(string, tsdPtr->patterns[i]) == 0)) { 00879 /* 00880 * Move the matched pattern to the first slot in the cache and 00881 * shift the other patterns down one position. 00882 */ 00883 00884 if (i != 0) { 00885 int j; 00886 char *cachedString; 00887 00888 cachedString = tsdPtr->patterns[i]; 00889 regexpPtr = tsdPtr->regexps[i]; 00890 for (j = i-1; j >= 0; j--) { 00891 tsdPtr->patterns[j+1] = tsdPtr->patterns[j]; 00892 tsdPtr->patLengths[j+1] = tsdPtr->patLengths[j]; 00893 tsdPtr->regexps[j+1] = tsdPtr->regexps[j]; 00894 } 00895 tsdPtr->patterns[0] = cachedString; 00896 tsdPtr->patLengths[0] = length; 00897 tsdPtr->regexps[0] = regexpPtr; 00898 } 00899 return tsdPtr->regexps[0]; 00900 } 00901 } 00902 00903 /* 00904 * This is a new expression, so compile it and add it to the cache. 00905 */ 00906 00907 regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp)); 00908 regexpPtr->objPtr = NULL; 00909 regexpPtr->string = NULL; 00910 regexpPtr->details.rm_extend.rm_so = -1; 00911 regexpPtr->details.rm_extend.rm_eo = -1; 00912 00913 /* 00914 * Get the up-to-date string representation and map to unicode. 00915 */ 00916 00917 Tcl_DStringInit(&stringBuf); 00918 uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf); 00919 numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar); 00920 00921 /* 00922 * Compile the string and check for errors. 00923 */ 00924 00925 regexpPtr->flags = flags; 00926 status = TclReComp(®expPtr->re, uniString, (size_t) numChars, flags); 00927 Tcl_DStringFree(&stringBuf); 00928 00929 if (status != REG_OKAY) { 00930 /* 00931 * Clean up and report errors in the interpreter, if possible. 00932 */ 00933 00934 ckfree((char *)regexpPtr); 00935 if (interp) { 00936 TclRegError(interp, 00937 "couldn't compile regular expression pattern: ", status); 00938 } 00939 return NULL; 00940 } 00941 00942 /* 00943 * Convert RE to a glob pattern equivalent, if any, and cache it. If this 00944 * is not possible, then globObjPtr will be NULL. This is used by 00945 * Tcl_RegExpExecObj to optionally do a fast match (avoids RE engine). 00946 */ 00947 00948 if (TclReToGlob(NULL, string, length, &stringBuf, &exact) == TCL_OK) { 00949 regexpPtr->globObjPtr = Tcl_NewStringObj(Tcl_DStringValue(&stringBuf), 00950 Tcl_DStringLength(&stringBuf)); 00951 Tcl_IncrRefCount(regexpPtr->globObjPtr); 00952 Tcl_DStringFree(&stringBuf); 00953 } else { 00954 regexpPtr->globObjPtr = NULL; 00955 } 00956 00957 /* 00958 * Allocate enough space for all of the subexpressions, plus one extra for 00959 * the entire pattern. 00960 */ 00961 00962 regexpPtr->matches = (regmatch_t *) ckalloc( 00963 sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1)); 00964 00965 /* 00966 * Initialize the refcount to one initially, since it is in the cache. 00967 */ 00968 00969 regexpPtr->refCount = 1; 00970 00971 /* 00972 * Free the last regexp, if necessary, and make room at the head of the 00973 * list for the new regexp. 00974 */ 00975 00976 if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) { 00977 TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1]; 00978 if (--(oldRegexpPtr->refCount) <= 0) { 00979 FreeRegexp(oldRegexpPtr); 00980 } 00981 ckfree(tsdPtr->patterns[NUM_REGEXPS-1]); 00982 } 00983 for (i = NUM_REGEXPS - 2; i >= 0; i--) { 00984 tsdPtr->patterns[i+1] = tsdPtr->patterns[i]; 00985 tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i]; 00986 tsdPtr->regexps[i+1] = tsdPtr->regexps[i]; 00987 } 00988 tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1)); 00989 strcpy(tsdPtr->patterns[0], string); 00990 tsdPtr->patLengths[0] = length; 00991 tsdPtr->regexps[0] = regexpPtr; 00992 00993 return regexpPtr; 00994 } 00995 00996 /* 00997 *---------------------------------------------------------------------- 00998 * 00999 * FreeRegexp -- 01000 * 01001 * Release the storage associated with a TclRegexp. 01002 * 01003 * Results: 01004 * None. 01005 * 01006 * Side effects: 01007 * None. 01008 * 01009 *---------------------------------------------------------------------- 01010 */ 01011 01012 static void 01013 FreeRegexp( 01014 TclRegexp *regexpPtr) /* Compiled regular expression to free. */ 01015 { 01016 TclReFree(®expPtr->re); 01017 if (regexpPtr->globObjPtr) { 01018 TclDecrRefCount(regexpPtr->globObjPtr); 01019 } 01020 if (regexpPtr->matches) { 01021 ckfree((char *) regexpPtr->matches); 01022 } 01023 ckfree((char *) regexpPtr); 01024 } 01025 01026 /* 01027 *---------------------------------------------------------------------- 01028 * 01029 * FinalizeRegexp -- 01030 * 01031 * Release the storage associated with the per-thread regexp cache. 01032 * 01033 * Results: 01034 * None. 01035 * 01036 * Side effects: 01037 * None. 01038 * 01039 *---------------------------------------------------------------------- 01040 */ 01041 01042 static void 01043 FinalizeRegexp( 01044 ClientData clientData) /* Not used. */ 01045 { 01046 int i; 01047 TclRegexp *regexpPtr; 01048 ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey); 01049 01050 for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) { 01051 regexpPtr = tsdPtr->regexps[i]; 01052 if (--(regexpPtr->refCount) <= 0) { 01053 FreeRegexp(regexpPtr); 01054 } 01055 ckfree(tsdPtr->patterns[i]); 01056 tsdPtr->patterns[i] = NULL; 01057 } 01058 /* 01059 * We may find ourselves reinitialized if another finalization routine 01060 * invokes regexps. 01061 */ 01062 tsdPtr->initialized = 0; 01063 } 01064 01065 /* 01066 * Local Variables: 01067 * mode: c 01068 * c-basic-offset: 4 01069 * fill-column: 78 01070 * End: 01071 */
Generated on Wed Mar 12 12:18:21 2008 by 1.5.1 |