tclUtf.cGo to the documentation of this file.00001 /* 00002 * tclUtf.c -- 00003 * 00004 * Routines for manipulating UTF-8 strings. 00005 * 00006 * Copyright (c) 1997-1998 Sun Microsystems, Inc. 00007 * 00008 * See the file "license.terms" for information on usage and redistribution of 00009 * this file, and for a DISCLAIMER OF ALL WARRANTIES. 00010 * 00011 * RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $ 00012 */ 00013 00014 #include "tclInt.h" 00015 00016 /* 00017 * Include the static character classification tables and macros. 00018 */ 00019 00020 #include "tclUniData.c" 00021 00022 /* 00023 * The following macros are used for fast character category tests. The x_BITS 00024 * values are shifted right by the category value to determine whether the 00025 * given category is included in the set. 00026 */ 00027 00028 #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \ 00029 | (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER)) 00030 00031 #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER) 00032 00033 #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \ 00034 | (1 << PARAGRAPH_SEPARATOR)) 00035 00036 #define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION) 00037 00038 #define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \ 00039 (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \ 00040 (1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \ 00041 (1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \ 00042 (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ 00043 (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ 00044 (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \ 00045 (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \ 00046 (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL)) 00047 00048 #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \ 00049 (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \ 00050 (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \ 00051 (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION)) 00052 00053 /* 00054 * Unicode characters less than this value are represented by themselves in 00055 * UTF-8 strings. 00056 */ 00057 00058 #define UNICODE_SELF 0x80 00059 00060 /* 00061 * The following structures are used when mapping between Unicode (UCS-2) and 00062 * UTF-8. 00063 */ 00064 00065 static CONST unsigned char totalBytes[256] = { 00066 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00067 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00068 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00069 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00070 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00071 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 00072 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 00073 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 00074 #if TCL_UTF_MAX > 3 00075 4,4,4,4,4,4,4,4, 00076 #else 00077 1,1,1,1,1,1,1,1, 00078 #endif 00079 #if TCL_UTF_MAX > 4 00080 5,5,5,5, 00081 #else 00082 1,1,1,1, 00083 #endif 00084 #if TCL_UTF_MAX > 5 00085 6,6,6,6 00086 #else 00087 1,1,1,1 00088 #endif 00089 }; 00090 00091 /* 00092 * Functions used only in this module. 00093 */ 00094 00095 static int UtfCount(int ch); 00096 00097 /* 00098 *--------------------------------------------------------------------------- 00099 * 00100 * UtfCount -- 00101 * 00102 * Find the number of bytes in the Utf character "ch". 00103 * 00104 * Results: 00105 * The return values is the number of bytes in the Utf character "ch". 00106 * 00107 * Side effects: 00108 * None. 00109 * 00110 *--------------------------------------------------------------------------- 00111 */ 00112 00113 INLINE static int 00114 UtfCount( 00115 int ch) /* The Tcl_UniChar whose size is returned. */ 00116 { 00117 if ((ch > 0) && (ch < UNICODE_SELF)) { 00118 return 1; 00119 } 00120 if (ch <= 0x7FF) { 00121 return 2; 00122 } 00123 if (ch <= 0xFFFF) { 00124 return 3; 00125 } 00126 #if TCL_UTF_MAX > 3 00127 if (ch <= 0x1FFFFF) { 00128 return 4; 00129 } 00130 if (ch <= 0x3FFFFFF) { 00131 return 5; 00132 } 00133 if (ch <= 0x7FFFFFFF) { 00134 return 6; 00135 } 00136 #endif 00137 return 3; 00138 } 00139 00140 /* 00141 *--------------------------------------------------------------------------- 00142 * 00143 * Tcl_UniCharToUtf -- 00144 * 00145 * Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the 00146 * provided buffer. Equivalent to Plan 9 runetochar(). 00147 * 00148 * Results: 00149 * The return values is the number of bytes in the buffer that were 00150 * consumed. 00151 * 00152 * Side effects: 00153 * None. 00154 * 00155 *--------------------------------------------------------------------------- 00156 */ 00157 00158 INLINE int 00159 Tcl_UniCharToUtf( 00160 int ch, /* The Tcl_UniChar to be stored in the 00161 * buffer. */ 00162 char *buf) /* Buffer in which the UTF-8 representation of 00163 * the Tcl_UniChar is stored. Buffer must be 00164 * large enough to hold the UTF-8 character 00165 * (at most TCL_UTF_MAX bytes). */ 00166 { 00167 if ((ch > 0) && (ch < UNICODE_SELF)) { 00168 buf[0] = (char) ch; 00169 return 1; 00170 } 00171 if (ch >= 0) { 00172 if (ch <= 0x7FF) { 00173 buf[1] = (char) ((ch | 0x80) & 0xBF); 00174 buf[0] = (char) ((ch >> 6) | 0xC0); 00175 return 2; 00176 } 00177 if (ch <= 0xFFFF) { 00178 three: 00179 buf[2] = (char) ((ch | 0x80) & 0xBF); 00180 buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); 00181 buf[0] = (char) ((ch >> 12) | 0xE0); 00182 return 3; 00183 } 00184 00185 #if TCL_UTF_MAX > 3 00186 if (ch <= 0x1FFFFF) { 00187 buf[3] = (char) ((ch | 0x80) & 0xBF); 00188 buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); 00189 buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); 00190 buf[0] = (char) ((ch >> 18) | 0xF0); 00191 return 4; 00192 } 00193 if (ch <= 0x3FFFFFF) { 00194 buf[4] = (char) ((ch | 0x80) & 0xBF); 00195 buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); 00196 buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); 00197 buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); 00198 buf[0] = (char) ((ch >> 24) | 0xF8); 00199 return 5; 00200 } 00201 if (ch <= 0x7FFFFFFF) { 00202 buf[5] = (char) ((ch | 0x80) & 0xBF); 00203 buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); 00204 buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); 00205 buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); 00206 buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); 00207 buf[0] = (char) ((ch >> 30) | 0xFC); 00208 return 6; 00209 } 00210 #endif 00211 } 00212 00213 ch = 0xFFFD; 00214 goto three; 00215 } 00216 00217 /* 00218 *--------------------------------------------------------------------------- 00219 * 00220 * Tcl_UniCharToUtfDString -- 00221 * 00222 * Convert the given Unicode string to UTF-8. 00223 * 00224 * Results: 00225 * The return value is a pointer to the UTF-8 representation of the 00226 * Unicode string. Storage for the return value is appended to the end of 00227 * dsPtr. 00228 * 00229 * Side effects: 00230 * None. 00231 * 00232 *--------------------------------------------------------------------------- 00233 */ 00234 00235 char * 00236 Tcl_UniCharToUtfDString( 00237 CONST Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ 00238 int uniLength, /* Length of Unicode string in Tcl_UniChars 00239 * (must be >= 0). */ 00240 Tcl_DString *dsPtr) /* UTF-8 representation of string is appended 00241 * to this previously initialized DString. */ 00242 { 00243 CONST Tcl_UniChar *w, *wEnd; 00244 char *p, *string; 00245 int oldLength; 00246 00247 /* 00248 * UTF-8 string length in bytes will be <= Unicode string length * 00249 * TCL_UTF_MAX. 00250 */ 00251 00252 oldLength = Tcl_DStringLength(dsPtr); 00253 Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); 00254 string = Tcl_DStringValue(dsPtr) + oldLength; 00255 00256 p = string; 00257 wEnd = uniStr + uniLength; 00258 for (w = uniStr; w < wEnd; ) { 00259 p += Tcl_UniCharToUtf(*w, p); 00260 w++; 00261 } 00262 Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); 00263 00264 return string; 00265 } 00266 00267 /* 00268 *--------------------------------------------------------------------------- 00269 * 00270 * Tcl_UtfToUniChar -- 00271 * 00272 * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 00273 * sequences are converted to valid Tcl_UniChars and processing 00274 * continues. Equivalent to Plan 9 chartorune(). 00275 * 00276 * The caller must ensure that the source buffer is long enough that this 00277 * routine does not run off the end and dereference non-existent memory 00278 * looking for trail bytes. If the source buffer is known to be '\0' 00279 * terminated, this cannot happen. Otherwise, the caller should call 00280 * Tcl_UtfCharComplete() before calling this routine to ensure that 00281 * enough bytes remain in the string. 00282 * 00283 * Results: 00284 * *chPtr is filled with the Tcl_UniChar, and the return value is the 00285 * number of bytes from the UTF-8 string that were consumed. 00286 * 00287 * Side effects: 00288 * None. 00289 * 00290 *--------------------------------------------------------------------------- 00291 */ 00292 00293 int 00294 Tcl_UtfToUniChar( 00295 register CONST char *src, /* The UTF-8 string. */ 00296 register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by 00297 * the UTF-8 string. */ 00298 { 00299 register int byte; 00300 00301 /* 00302 * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones. 00303 */ 00304 00305 byte = *((unsigned char *) src); 00306 if (byte < 0xC0) { 00307 /* 00308 * Handles properly formed UTF-8 characters between 0x01 and 0x7F. 00309 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid 00310 * characters representing themselves. 00311 */ 00312 00313 *chPtr = (Tcl_UniChar) byte; 00314 return 1; 00315 } else if (byte < 0xE0) { 00316 if ((src[1] & 0xC0) == 0x80) { 00317 /* 00318 * Two-byte-character lead-byte followed by a trail-byte. 00319 */ 00320 00321 *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); 00322 return 2; 00323 } 00324 00325 /* 00326 * A two-byte-character lead-byte not followed by trail-byte 00327 * represents itself. 00328 */ 00329 00330 *chPtr = (Tcl_UniChar) byte; 00331 return 1; 00332 } else if (byte < 0xF0) { 00333 if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { 00334 /* 00335 * Three-byte-character lead byte followed by two trail bytes. 00336 */ 00337 00338 *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 00339 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); 00340 return 3; 00341 } 00342 00343 /* 00344 * A three-byte-character lead-byte not followed by two trail-bytes 00345 * represents itself. 00346 */ 00347 00348 *chPtr = (Tcl_UniChar) byte; 00349 return 1; 00350 } 00351 #if TCL_UTF_MAX > 3 00352 { 00353 int ch, total, trail; 00354 00355 total = totalBytes[byte]; 00356 trail = total - 1; 00357 if (trail > 0) { 00358 ch = byte & (0x3F >> trail); 00359 do { 00360 src++; 00361 if ((*src & 0xC0) != 0x80) { 00362 *chPtr = byte; 00363 return 1; 00364 } 00365 ch <<= 6; 00366 ch |= (*src & 0x3F); 00367 trail--; 00368 } while (trail > 0); 00369 *chPtr = ch; 00370 return total; 00371 } 00372 } 00373 #endif 00374 00375 *chPtr = (Tcl_UniChar) byte; 00376 return 1; 00377 } 00378 00379 /* 00380 *--------------------------------------------------------------------------- 00381 * 00382 * Tcl_UtfToUniCharDString -- 00383 * 00384 * Convert the UTF-8 string to Unicode. 00385 * 00386 * Results: 00387 * The return value is a pointer to the Unicode representation of the 00388 * UTF-8 string. Storage for the return value is appended to the end of 00389 * dsPtr. The Unicode string is terminated with a Unicode NULL character. 00390 * 00391 * Side effects: 00392 * None. 00393 * 00394 *--------------------------------------------------------------------------- 00395 */ 00396 00397 Tcl_UniChar * 00398 Tcl_UtfToUniCharDString( 00399 CONST char *src, /* UTF-8 string to convert to Unicode. */ 00400 int length, /* Length of UTF-8 string in bytes, or -1 for 00401 * strlen(). */ 00402 Tcl_DString *dsPtr) /* Unicode representation of string is 00403 * appended to this previously initialized 00404 * DString. */ 00405 { 00406 Tcl_UniChar *w, *wString; 00407 CONST char *p, *end; 00408 int oldLength; 00409 00410 if (length < 0) { 00411 length = strlen(src); 00412 } 00413 00414 /* 00415 * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in 00416 * bytes. 00417 */ 00418 00419 oldLength = Tcl_DStringLength(dsPtr); 00420 Tcl_DStringSetLength(dsPtr, 00421 (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); 00422 wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); 00423 00424 w = wString; 00425 end = src + length; 00426 for (p = src; p < end; ) { 00427 p += TclUtfToUniChar(p, w); 00428 w++; 00429 } 00430 *w = '\0'; 00431 Tcl_DStringSetLength(dsPtr, 00432 (oldLength + ((char *) w - (char *) wString))); 00433 00434 return wString; 00435 } 00436 00437 /* 00438 *--------------------------------------------------------------------------- 00439 * 00440 * Tcl_UtfCharComplete -- 00441 * 00442 * Determine if the UTF-8 string of the given length is long enough to be 00443 * decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8 00444 * string is properly formed. Equivalent to Plan 9 fullrune(). 00445 * 00446 * Results: 00447 * The return value is 0 if the string is not long enough, non-zero 00448 * otherwise. 00449 * 00450 * Side effects: 00451 * None. 00452 * 00453 *--------------------------------------------------------------------------- 00454 */ 00455 00456 int 00457 Tcl_UtfCharComplete( 00458 CONST char *src, /* String to check if first few bytes contain 00459 * a complete UTF-8 character. */ 00460 int length) /* Length of above string in bytes. */ 00461 { 00462 int ch; 00463 00464 ch = *((unsigned char *) src); 00465 return length >= totalBytes[ch]; 00466 } 00467 00468 /* 00469 *--------------------------------------------------------------------------- 00470 * 00471 * Tcl_NumUtfChars -- 00472 * 00473 * Returns the number of characters (not bytes) in the UTF-8 string, not 00474 * including the terminating NULL byte. This is equivalent to Plan 9 00475 * utflen() and utfnlen(). 00476 * 00477 * Results: 00478 * As above. 00479 * 00480 * Side effects: 00481 * None. 00482 * 00483 *--------------------------------------------------------------------------- 00484 */ 00485 00486 int 00487 Tcl_NumUtfChars( 00488 register CONST char *src, /* The UTF-8 string to measure. */ 00489 int length) /* The length of the string in bytes, or -1 00490 * for strlen(string). */ 00491 { 00492 Tcl_UniChar ch; 00493 register Tcl_UniChar *chPtr = &ch; 00494 register int i; 00495 00496 /* 00497 * The separate implementations are faster. 00498 * 00499 * Since this is a time-sensitive function, we also do the check for the 00500 * single-byte char case specially. 00501 */ 00502 00503 i = 0; 00504 if (length < 0) { 00505 while (*src != '\0') { 00506 src += TclUtfToUniChar(src, chPtr); 00507 i++; 00508 } 00509 } else { 00510 register int n; 00511 00512 while (length > 0) { 00513 if (UCHAR(*src) < 0xC0) { 00514 length--; 00515 src++; 00516 } else { 00517 n = Tcl_UtfToUniChar(src, chPtr); 00518 length -= n; 00519 src += n; 00520 } 00521 i++; 00522 } 00523 } 00524 return i; 00525 } 00526 00527 /* 00528 *--------------------------------------------------------------------------- 00529 * 00530 * Tcl_UtfFindFirst -- 00531 * 00532 * Returns a pointer to the first occurance of the given Tcl_UniChar in 00533 * the NULL-terminated UTF-8 string. The NULL terminator is considered 00534 * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). 00535 * 00536 * Results: 00537 * As above. If the Tcl_UniChar does not exist in the given string, the 00538 * return value is NULL. 00539 * 00540 * Side effects: 00541 * None. 00542 * 00543 *--------------------------------------------------------------------------- 00544 */ 00545 00546 CONST char * 00547 Tcl_UtfFindFirst( 00548 CONST char *src, /* The UTF-8 string to be searched. */ 00549 int ch) /* The Tcl_UniChar to search for. */ 00550 { 00551 int len; 00552 Tcl_UniChar find; 00553 00554 while (1) { 00555 len = TclUtfToUniChar(src, &find); 00556 if (find == ch) { 00557 return src; 00558 } 00559 if (*src == '\0') { 00560 return NULL; 00561 } 00562 src += len; 00563 } 00564 } 00565 00566 /* 00567 *--------------------------------------------------------------------------- 00568 * 00569 * Tcl_UtfFindLast -- 00570 * 00571 * Returns a pointer to the last occurance of the given Tcl_UniChar in 00572 * the NULL-terminated UTF-8 string. The NULL terminator is considered 00573 * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). 00574 * 00575 * Results: 00576 * As above. If the Tcl_UniChar does not exist in the given string, the 00577 * return value is NULL. 00578 * 00579 * Side effects: 00580 * None. 00581 * 00582 *--------------------------------------------------------------------------- 00583 */ 00584 00585 CONST char * 00586 Tcl_UtfFindLast( 00587 CONST char *src, /* The UTF-8 string to be searched. */ 00588 int ch) /* The Tcl_UniChar to search for. */ 00589 { 00590 int len; 00591 Tcl_UniChar find; 00592 CONST char *last; 00593 00594 last = NULL; 00595 while (1) { 00596 len = TclUtfToUniChar(src, &find); 00597 if (find == ch) { 00598 last = src; 00599 } 00600 if (*src == '\0') { 00601 break; 00602 } 00603 src += len; 00604 } 00605 return last; 00606 } 00607 00608 /* 00609 *--------------------------------------------------------------------------- 00610 * 00611 * Tcl_UtfNext -- 00612 * 00613 * Given a pointer to some current location in a UTF-8 string, move 00614 * forward one character. The caller must ensure that they are not asking 00615 * for the next character after the last character in the string. 00616 * 00617 * Results: 00618 * The return value is the pointer to the next character in the UTF-8 00619 * string. 00620 * 00621 * Side effects: 00622 * None. 00623 * 00624 *--------------------------------------------------------------------------- 00625 */ 00626 00627 CONST char * 00628 Tcl_UtfNext( 00629 CONST char *src) /* The current location in the string. */ 00630 { 00631 Tcl_UniChar ch; 00632 00633 return src + TclUtfToUniChar(src, &ch); 00634 } 00635 00636 /* 00637 *--------------------------------------------------------------------------- 00638 * 00639 * Tcl_UtfPrev -- 00640 * 00641 * Given a pointer to some current location in a UTF-8 string, move 00642 * backwards one character. This works correctly when the pointer is in 00643 * the middle of a UTF-8 character. 00644 * 00645 * Results: 00646 * The return value is a pointer to the previous character in the UTF-8 00647 * string. If the current location was already at the beginning of the 00648 * string, the return value will also be a pointer to the beginning of 00649 * the string. 00650 * 00651 * Side effects: 00652 * None. 00653 * 00654 *--------------------------------------------------------------------------- 00655 */ 00656 00657 CONST char * 00658 Tcl_UtfPrev( 00659 CONST char *src, /* The current location in the string. */ 00660 CONST char *start) /* Pointer to the beginning of the string, to 00661 * avoid going backwards too far. */ 00662 { 00663 CONST char *look; 00664 int i, byte; 00665 00666 src--; 00667 look = src; 00668 for (i = 0; i < TCL_UTF_MAX; i++) { 00669 if (look < start) { 00670 if (src < start) { 00671 src = start; 00672 } 00673 break; 00674 } 00675 byte = *((unsigned char *) look); 00676 if (byte < 0x80) { 00677 break; 00678 } 00679 if (byte >= 0xC0) { 00680 return look; 00681 } 00682 look--; 00683 } 00684 return src; 00685 } 00686 00687 /* 00688 *--------------------------------------------------------------------------- 00689 * 00690 * Tcl_UniCharAtIndex -- 00691 * 00692 * Returns the Unicode character represented at the specified character 00693 * (not byte) position in the UTF-8 string. 00694 * 00695 * Results: 00696 * As above. 00697 * 00698 * Side effects: 00699 * None. 00700 * 00701 *--------------------------------------------------------------------------- 00702 */ 00703 00704 Tcl_UniChar 00705 Tcl_UniCharAtIndex( 00706 register CONST char *src, /* The UTF-8 string to dereference. */ 00707 register int index) /* The position of the desired character. */ 00708 { 00709 Tcl_UniChar ch; 00710 00711 while (index >= 0) { 00712 index--; 00713 src += TclUtfToUniChar(src, &ch); 00714 } 00715 return ch; 00716 } 00717 00718 /* 00719 *--------------------------------------------------------------------------- 00720 * 00721 * Tcl_UtfAtIndex -- 00722 * 00723 * Returns a pointer to the specified character (not byte) position in 00724 * the UTF-8 string. 00725 * 00726 * Results: 00727 * As above. 00728 * 00729 * Side effects: 00730 * None. 00731 * 00732 *--------------------------------------------------------------------------- 00733 */ 00734 00735 CONST char * 00736 Tcl_UtfAtIndex( 00737 register CONST char *src, /* The UTF-8 string. */ 00738 register int index) /* The position of the desired character. */ 00739 { 00740 Tcl_UniChar ch; 00741 00742 while (index > 0) { 00743 index--; 00744 src += TclUtfToUniChar(src, &ch); 00745 } 00746 return src; 00747 } 00748 00749 /* 00750 *--------------------------------------------------------------------------- 00751 * 00752 * Tcl_UtfBackslash -- 00753 * 00754 * Figure out how to handle a backslash sequence. 00755 * 00756 * Results: 00757 * Stores the bytes represented by the backslash sequence in dst and 00758 * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes 00759 * are written to dst; dst must have been large enough to accept those 00760 * bytes. If readPtr isn't NULL then it is filled in with a count of the 00761 * number of bytes in the backslash sequence. 00762 * 00763 * Side effects: 00764 * The maximum number of bytes it takes to represent a Unicode character 00765 * in UTF-8 is guaranteed to be less than the number of bytes used to 00766 * express the backslash sequence that represents that Unicode character. 00767 * If the target buffer into which the caller is going to store the bytes 00768 * that represent the Unicode character is at least as large as the 00769 * source buffer from which the backslashed sequence was extracted, no 00770 * buffer overruns should occur. 00771 * 00772 *--------------------------------------------------------------------------- 00773 */ 00774 00775 int 00776 Tcl_UtfBackslash( 00777 CONST char *src, /* Points to the backslash character of a 00778 * backslash sequence. */ 00779 int *readPtr, /* Fill in with number of characters read from 00780 * src, unless NULL. */ 00781 char *dst) /* Filled with the bytes represented by the 00782 * backslash sequence. */ 00783 { 00784 #define LINE_LENGTH 128 00785 int numRead; 00786 int result; 00787 00788 result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); 00789 if (numRead == LINE_LENGTH) { 00790 /* 00791 * We ate a whole line. Pay the price of a strlen() 00792 */ 00793 00794 result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); 00795 } 00796 if (readPtr != NULL) { 00797 *readPtr = numRead; 00798 } 00799 return result; 00800 } 00801 00802 /* 00803 *---------------------------------------------------------------------- 00804 * 00805 * Tcl_UtfToUpper -- 00806 * 00807 * Convert lowercase characters to uppercase characters in a UTF string 00808 * in place. The conversion may shrink the UTF string. 00809 * 00810 * Results: 00811 * Returns the number of bytes in the resulting string excluding the 00812 * trailing null. 00813 * 00814 * Side effects: 00815 * Writes a terminating null after the last converted character. 00816 * 00817 *---------------------------------------------------------------------- 00818 */ 00819 00820 int 00821 Tcl_UtfToUpper( 00822 char *str) /* String to convert in place. */ 00823 { 00824 Tcl_UniChar ch, upChar; 00825 char *src, *dst; 00826 int bytes; 00827 00828 /* 00829 * Iterate over the string until we hit the terminating null. 00830 */ 00831 00832 src = dst = str; 00833 while (*src) { 00834 bytes = TclUtfToUniChar(src, &ch); 00835 upChar = Tcl_UniCharToUpper(ch); 00836 00837 /* 00838 * To keep badly formed Utf strings from getting inflated by the 00839 * conversion (thereby causing a segfault), only copy the upper case 00840 * char to dst if its size is <= the original char. 00841 */ 00842 00843 if (bytes < UtfCount(upChar)) { 00844 memcpy(dst, src, (size_t) bytes); 00845 dst += bytes; 00846 } else { 00847 dst += Tcl_UniCharToUtf(upChar, dst); 00848 } 00849 src += bytes; 00850 } 00851 *dst = '\0'; 00852 return (dst - str); 00853 } 00854 00855 /* 00856 *---------------------------------------------------------------------- 00857 * 00858 * Tcl_UtfToLower -- 00859 * 00860 * Convert uppercase characters to lowercase characters in a UTF string 00861 * in place. The conversion may shrink the UTF string. 00862 * 00863 * Results: 00864 * Returns the number of bytes in the resulting string excluding the 00865 * trailing null. 00866 * 00867 * Side effects: 00868 * Writes a terminating null after the last converted character. 00869 * 00870 *---------------------------------------------------------------------- 00871 */ 00872 00873 int 00874 Tcl_UtfToLower( 00875 char *str) /* String to convert in place. */ 00876 { 00877 Tcl_UniChar ch, lowChar; 00878 char *src, *dst; 00879 int bytes; 00880 00881 /* 00882 * Iterate over the string until we hit the terminating null. 00883 */ 00884 00885 src = dst = str; 00886 while (*src) { 00887 bytes = TclUtfToUniChar(src, &ch); 00888 lowChar = Tcl_UniCharToLower(ch); 00889 00890 /* 00891 * To keep badly formed Utf strings from getting inflated by the 00892 * conversion (thereby causing a segfault), only copy the lower case 00893 * char to dst if its size is <= the original char. 00894 */ 00895 00896 if (bytes < UtfCount(lowChar)) { 00897 memcpy(dst, src, (size_t) bytes); 00898 dst += bytes; 00899 } else { 00900 dst += Tcl_UniCharToUtf(lowChar, dst); 00901 } 00902 src += bytes; 00903 } 00904 *dst = '\0'; 00905 return (dst - str); 00906 } 00907 00908 /* 00909 *---------------------------------------------------------------------- 00910 * 00911 * Tcl_UtfToTitle -- 00912 * 00913 * Changes the first character of a UTF string to title case or uppercase 00914 * and the rest of the string to lowercase. The conversion happens in 00915 * place and may shrink the UTF string. 00916 * 00917 * Results: 00918 * Returns the number of bytes in the resulting string excluding the 00919 * trailing null. 00920 * 00921 * Side effects: 00922 * Writes a terminating null after the last converted character. 00923 * 00924 *---------------------------------------------------------------------- 00925 */ 00926 00927 int 00928 Tcl_UtfToTitle( 00929 char *str) /* String to convert in place. */ 00930 { 00931 Tcl_UniChar ch, titleChar, lowChar; 00932 char *src, *dst; 00933 int bytes; 00934 00935 /* 00936 * Capitalize the first character and then lowercase the rest of the 00937 * characters until we get to a null. 00938 */ 00939 00940 src = dst = str; 00941 00942 if (*src) { 00943 bytes = TclUtfToUniChar(src, &ch); 00944 titleChar = Tcl_UniCharToTitle(ch); 00945 00946 if (bytes < UtfCount(titleChar)) { 00947 memcpy(dst, src, (size_t) bytes); 00948 dst += bytes; 00949 } else { 00950 dst += Tcl_UniCharToUtf(titleChar, dst); 00951 } 00952 src += bytes; 00953 } 00954 while (*src) { 00955 bytes = TclUtfToUniChar(src, &ch); 00956 lowChar = Tcl_UniCharToLower(ch); 00957 00958 if (bytes < UtfCount(lowChar)) { 00959 memcpy(dst, src, (size_t) bytes); 00960 dst += bytes; 00961 } else { 00962 dst += Tcl_UniCharToUtf(lowChar, dst); 00963 } 00964 src += bytes; 00965 } 00966 *dst = '\0'; 00967 return (dst - str); 00968 } 00969 00970 /* 00971 *---------------------------------------------------------------------- 00972 * 00973 * TclpUtfNcmp2 -- 00974 * 00975 * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and 00976 * ct are assumed to be at least numBytes bytes long. 00977 * 00978 * Results: 00979 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. 00980 * 00981 * Side effects: 00982 * None. 00983 * 00984 *---------------------------------------------------------------------- 00985 */ 00986 00987 int 00988 TclpUtfNcmp2( 00989 CONST char *cs, /* UTF string to compare to ct. */ 00990 CONST char *ct, /* UTF string cs is compared to. */ 00991 unsigned long numBytes) /* Number of *bytes* to compare. */ 00992 { 00993 /* 00994 * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to 00995 * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes 00996 * fine in the strcmp manner. 00997 */ 00998 00999 register int result = 0; 01000 01001 for ( ; numBytes != 0; numBytes--, cs++, ct++) { 01002 if (*cs != *ct) { 01003 result = UCHAR(*cs) - UCHAR(*ct); 01004 break; 01005 } 01006 } 01007 if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) { 01008 unsigned char c1, c2; 01009 01010 c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs); 01011 c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct); 01012 result = (c1 - c2); 01013 } 01014 return result; 01015 } 01016 01017 /* 01018 *---------------------------------------------------------------------- 01019 * 01020 * Tcl_UtfNcmp -- 01021 * 01022 * Compare at most numChars UTF chars of string cs to string ct. Both cs 01023 * and ct are assumed to be at least numChars UTF chars long. 01024 * 01025 * Results: 01026 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. 01027 * 01028 * Side effects: 01029 * None. 01030 * 01031 *---------------------------------------------------------------------- 01032 */ 01033 01034 int 01035 Tcl_UtfNcmp( 01036 CONST char *cs, /* UTF string to compare to ct. */ 01037 CONST char *ct, /* UTF string cs is compared to. */ 01038 unsigned long numChars) /* Number of UTF chars to compare. */ 01039 { 01040 Tcl_UniChar ch1, ch2; 01041 01042 /* 01043 * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the 01044 * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001 01045 * (the byte 0x01.) 01046 */ 01047 01048 while (numChars-- > 0) { 01049 /* 01050 * n must be interpreted as chars, not bytes. This should be called 01051 * only when both strings are of at least n chars long (no need for \0 01052 * check) 01053 */ 01054 01055 cs += TclUtfToUniChar(cs, &ch1); 01056 ct += TclUtfToUniChar(ct, &ch2); 01057 if (ch1 != ch2) { 01058 return (ch1 - ch2); 01059 } 01060 } 01061 return 0; 01062 } 01063 01064 /* 01065 *---------------------------------------------------------------------- 01066 * 01067 * Tcl_UtfNcasecmp -- 01068 * 01069 * Compare at most numChars UTF chars of string cs to string ct case 01070 * insensitive. Both cs and ct are assumed to be at least numChars UTF 01071 * chars long. 01072 * 01073 * Results: 01074 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. 01075 * 01076 * Side effects: 01077 * None. 01078 * 01079 *---------------------------------------------------------------------- 01080 */ 01081 01082 int 01083 Tcl_UtfNcasecmp( 01084 CONST char *cs, /* UTF string to compare to ct. */ 01085 CONST char *ct, /* UTF string cs is compared to. */ 01086 unsigned long numChars) /* Number of UTF chars to compare. */ 01087 { 01088 Tcl_UniChar ch1, ch2; 01089 while (numChars-- > 0) { 01090 /* 01091 * n must be interpreted as chars, not bytes. 01092 * This should be called only when both strings are of 01093 * at least n chars long (no need for \0 check) 01094 */ 01095 cs += TclUtfToUniChar(cs, &ch1); 01096 ct += TclUtfToUniChar(ct, &ch2); 01097 if (ch1 != ch2) { 01098 ch1 = Tcl_UniCharToLower(ch1); 01099 ch2 = Tcl_UniCharToLower(ch2); 01100 if (ch1 != ch2) { 01101 return (ch1 - ch2); 01102 } 01103 } 01104 } 01105 return 0; 01106 } 01107 01108 /* 01109 *---------------------------------------------------------------------- 01110 * 01111 * Tcl_UniCharToUpper -- 01112 * 01113 * Compute the uppercase equivalent of the given Unicode character. 01114 * 01115 * Results: 01116 * Returns the uppercase Unicode character. 01117 * 01118 * Side effects: 01119 * None. 01120 * 01121 *---------------------------------------------------------------------- 01122 */ 01123 01124 Tcl_UniChar 01125 Tcl_UniCharToUpper( 01126 int ch) /* Unicode character to convert. */ 01127 { 01128 int info = GetUniCharInfo(ch); 01129 01130 if (GetCaseType(info) & 0x04) { 01131 return (Tcl_UniChar) (ch - GetDelta(info)); 01132 } else { 01133 return ch; 01134 } 01135 } 01136 01137 /* 01138 *---------------------------------------------------------------------- 01139 * 01140 * Tcl_UniCharToLower -- 01141 * 01142 * Compute the lowercase equivalent of the given Unicode character. 01143 * 01144 * Results: 01145 * Returns the lowercase Unicode character. 01146 * 01147 * Side effects: 01148 * None. 01149 * 01150 *---------------------------------------------------------------------- 01151 */ 01152 01153 Tcl_UniChar 01154 Tcl_UniCharToLower( 01155 int ch) /* Unicode character to convert. */ 01156 { 01157 int info = GetUniCharInfo(ch); 01158 01159 if (GetCaseType(info) & 0x02) { 01160 return (Tcl_UniChar) (ch + GetDelta(info)); 01161 } else { 01162 return ch; 01163 } 01164 } 01165 01166 /* 01167 *---------------------------------------------------------------------- 01168 * 01169 * Tcl_UniCharToTitle -- 01170 * 01171 * Compute the titlecase equivalent of the given Unicode character. 01172 * 01173 * Results: 01174 * Returns the titlecase Unicode character. 01175 * 01176 * Side effects: 01177 * None. 01178 * 01179 *---------------------------------------------------------------------- 01180 */ 01181 01182 Tcl_UniChar 01183 Tcl_UniCharToTitle( 01184 int ch) /* Unicode character to convert. */ 01185 { 01186 int info = GetUniCharInfo(ch); 01187 int mode = GetCaseType(info); 01188 01189 if (mode & 0x1) { 01190 /* 01191 * Subtract or add one depending on the original case. 01192 */ 01193 01194 return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1)); 01195 } else if (mode == 0x4) { 01196 return (Tcl_UniChar) (ch - GetDelta(info)); 01197 } else { 01198 return ch; 01199 } 01200 } 01201 01202 /* 01203 *---------------------------------------------------------------------- 01204 * 01205 * Tcl_UniCharLen -- 01206 * 01207 * Find the length of a UniChar string. The str input must be null 01208 * terminated. 01209 * 01210 * Results: 01211 * Returns the length of str in UniChars (not bytes). 01212 * 01213 * Side effects: 01214 * None. 01215 * 01216 *---------------------------------------------------------------------- 01217 */ 01218 01219 int 01220 Tcl_UniCharLen( 01221 CONST Tcl_UniChar *uniStr) /* Unicode string to find length of. */ 01222 { 01223 int len = 0; 01224 01225 while (*uniStr != '\0') { 01226 len++; 01227 uniStr++; 01228 } 01229 return len; 01230 } 01231 01232 /* 01233 *---------------------------------------------------------------------- 01234 * 01235 * Tcl_UniCharNcmp -- 01236 * 01237 * Compare at most numChars unichars of string ucs to string uct. 01238 * Both ucs and uct are assumed to be at least numChars unichars long. 01239 * 01240 * Results: 01241 * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. 01242 * 01243 * Side effects: 01244 * None. 01245 * 01246 *---------------------------------------------------------------------- 01247 */ 01248 01249 int 01250 Tcl_UniCharNcmp( 01251 CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ 01252 CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ 01253 unsigned long numChars) /* Number of unichars to compare. */ 01254 { 01255 #ifdef WORDS_BIGENDIAN 01256 /* 01257 * We are definitely on a big-endian machine; memcmp() is safe 01258 */ 01259 01260 return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); 01261 01262 #else /* !WORDS_BIGENDIAN */ 01263 /* 01264 * We can't simply call memcmp() because that is not lexically correct. 01265 */ 01266 01267 for ( ; numChars != 0; ucs++, uct++, numChars--) { 01268 if (*ucs != *uct) { 01269 return (*ucs - *uct); 01270 } 01271 } 01272 return 0; 01273 #endif /* WORDS_BIGENDIAN */ 01274 } 01275 01276 /* 01277 *---------------------------------------------------------------------- 01278 * 01279 * Tcl_UniCharNcasecmp -- 01280 * 01281 * Compare at most numChars unichars of string ucs to string uct case 01282 * insensitive. Both ucs and uct are assumed to be at least numChars 01283 * unichars long. 01284 * 01285 * Results: 01286 * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. 01287 * 01288 * Side effects: 01289 * None. 01290 * 01291 *---------------------------------------------------------------------- 01292 */ 01293 01294 int 01295 Tcl_UniCharNcasecmp( 01296 CONST Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ 01297 CONST Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ 01298 unsigned long numChars) /* Number of unichars to compare. */ 01299 { 01300 for ( ; numChars != 0; numChars--, ucs++, uct++) { 01301 if (*ucs != *uct) { 01302 Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); 01303 Tcl_UniChar lct = Tcl_UniCharToLower(*uct); 01304 01305 if (lcs != lct) { 01306 return (lcs - lct); 01307 } 01308 } 01309 } 01310 return 0; 01311 } 01312 01313 /* 01314 *---------------------------------------------------------------------- 01315 * 01316 * Tcl_UniCharIsAlnum -- 01317 * 01318 * Test if a character is an alphanumeric Unicode character. 01319 * 01320 * Results: 01321 * Returns 1 if character is alphanumeric. 01322 * 01323 * Side effects: 01324 * None. 01325 * 01326 *---------------------------------------------------------------------- 01327 */ 01328 01329 int 01330 Tcl_UniCharIsAlnum( 01331 int ch) /* Unicode character to test. */ 01332 { 01333 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 01334 01335 return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1); 01336 } 01337 01338 /* 01339 *---------------------------------------------------------------------- 01340 * 01341 * Tcl_UniCharIsAlpha -- 01342 * 01343 * Test if a character is an alphabetic Unicode character. 01344 * 01345 * Results: 01346 * Returns 1 if character is alphabetic. 01347 * 01348 * Side effects: 01349 * None. 01350 * 01351 *---------------------------------------------------------------------- 01352 */ 01353 01354 int 01355 Tcl_UniCharIsAlpha( 01356 int ch) /* Unicode character to test. */ 01357 { 01358 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 01359 return ((ALPHA_BITS >> category) & 1); 01360 } 01361 01362 /* 01363 *---------------------------------------------------------------------- 01364 * 01365 * Tcl_UniCharIsControl -- 01366 * 01367 * Test if a character is a Unicode control character. 01368 * 01369 * Results: 01370 * Returns non-zero if character is a control. 01371 * 01372 * Side effects: 01373 * None. 01374 * 01375 *---------------------------------------------------------------------- 01376 */ 01377 01378 int 01379 Tcl_UniCharIsControl( 01380 int ch) /* Unicode character to test. */ 01381 { 01382 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL); 01383 } 01384 01385 /* 01386 *---------------------------------------------------------------------- 01387 * 01388 * Tcl_UniCharIsDigit -- 01389 * 01390 * Test if a character is a numeric Unicode character. 01391 * 01392 * Results: 01393 * Returns non-zero if character is a digit. 01394 * 01395 * Side effects: 01396 * None. 01397 * 01398 *---------------------------------------------------------------------- 01399 */ 01400 01401 int 01402 Tcl_UniCharIsDigit( 01403 int ch) /* Unicode character to test. */ 01404 { 01405 return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER; 01406 } 01407 01408 /* 01409 *---------------------------------------------------------------------- 01410 * 01411 * Tcl_UniCharIsGraph -- 01412 * 01413 * Test if a character is any Unicode print character except space. 01414 * 01415 * Results: 01416 * Returns non-zero if character is printable, but not space. 01417 * 01418 * Side effects: 01419 * None. 01420 * 01421 *---------------------------------------------------------------------- 01422 */ 01423 01424 int 01425 Tcl_UniCharIsGraph( 01426 int ch) /* Unicode character to test. */ 01427 { 01428 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 01429 return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' ')); 01430 } 01431 01432 /* 01433 *---------------------------------------------------------------------- 01434 * 01435 * Tcl_UniCharIsLower -- 01436 * 01437 * Test if a character is a lowercase Unicode character. 01438 * 01439 * Results: 01440 * Returns non-zero if character is lowercase. 01441 * 01442 * Side effects: 01443 * None. 01444 * 01445 *---------------------------------------------------------------------- 01446 */ 01447 01448 int 01449 Tcl_UniCharIsLower( 01450 int ch) /* Unicode character to test. */ 01451 { 01452 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER); 01453 } 01454 01455 /* 01456 *---------------------------------------------------------------------- 01457 * 01458 * Tcl_UniCharIsPrint -- 01459 * 01460 * Test if a character is a Unicode print character. 01461 * 01462 * Results: 01463 * Returns non-zero if character is printable. 01464 * 01465 * Side effects: 01466 * None. 01467 * 01468 *---------------------------------------------------------------------- 01469 */ 01470 01471 int 01472 Tcl_UniCharIsPrint( 01473 int ch) /* Unicode character to test. */ 01474 { 01475 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 01476 return ((PRINT_BITS >> category) & 1); 01477 } 01478 01479 /* 01480 *---------------------------------------------------------------------- 01481 * 01482 * Tcl_UniCharIsPunct -- 01483 * 01484 * Test if a character is a Unicode punctuation character. 01485 * 01486 * Results: 01487 * Returns non-zero if character is punct. 01488 * 01489 * Side effects: 01490 * None. 01491 * 01492 *---------------------------------------------------------------------- 01493 */ 01494 01495 int 01496 Tcl_UniCharIsPunct( 01497 int ch) /* Unicode character to test. */ 01498 { 01499 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 01500 return ((PUNCT_BITS >> category) & 1); 01501 } 01502 01503 /* 01504 *---------------------------------------------------------------------- 01505 * 01506 * Tcl_UniCharIsSpace -- 01507 * 01508 * Test if a character is a whitespace Unicode character. 01509 * 01510 * Results: 01511 * Returns non-zero if character is a space. 01512 * 01513 * Side effects: 01514 * None. 01515 * 01516 *---------------------------------------------------------------------- 01517 */ 01518 01519 int 01520 Tcl_UniCharIsSpace( 01521 int ch) /* Unicode character to test. */ 01522 { 01523 register int category; 01524 01525 /* 01526 * If the character is within the first 127 characters, just use the 01527 * standard C function, otherwise consult the Unicode table. 01528 */ 01529 01530 if (ch < 0x80) { 01531 return isspace(UCHAR(ch)); /* INTL: ISO space */ 01532 } else { 01533 category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 01534 return ((SPACE_BITS >> category) & 1); 01535 } 01536 } 01537 01538 /* 01539 *---------------------------------------------------------------------- 01540 * 01541 * Tcl_UniCharIsUpper -- 01542 * 01543 * Test if a character is a uppercase Unicode character. 01544 * 01545 * Results: 01546 * Returns non-zero if character is uppercase. 01547 * 01548 * Side effects: 01549 * None. 01550 * 01551 *---------------------------------------------------------------------- 01552 */ 01553 01554 int 01555 Tcl_UniCharIsUpper( 01556 int ch) /* Unicode character to test. */ 01557 { 01558 return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER); 01559 } 01560 01561 /* 01562 *---------------------------------------------------------------------- 01563 * 01564 * Tcl_UniCharIsWordChar -- 01565 * 01566 * Test if a character is alphanumeric or a connector punctuation mark. 01567 * 01568 * Results: 01569 * Returns 1 if character is a word character. 01570 * 01571 * Side effects: 01572 * None. 01573 * 01574 *---------------------------------------------------------------------- 01575 */ 01576 01577 int 01578 Tcl_UniCharIsWordChar( 01579 int ch) /* Unicode character to test. */ 01580 { 01581 register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK); 01582 01583 return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1); 01584 } 01585 01586 /* 01587 *---------------------------------------------------------------------- 01588 * 01589 * Tcl_UniCharCaseMatch -- 01590 * 01591 * See if a particular Unicode string matches a particular pattern. 01592 * Allows case insensitivity. This is the Unicode equivalent of the char* 01593 * Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated. 01594 * This has no provision for counted UniChar strings, thus should not be 01595 * used where NULLs are expected in the UniChar string. Use 01596 * TclUniCharMatch where possible. 01597 * 01598 * Results: 01599 * The return value is 1 if string matches pattern, and 0 otherwise. The 01600 * matching operation permits the following special characters in the 01601 * pattern: *?\[] (see the manual entry for details on what these mean). 01602 * 01603 * Side effects: 01604 * None. 01605 * 01606 *---------------------------------------------------------------------- 01607 */ 01608 01609 int 01610 Tcl_UniCharCaseMatch( 01611 CONST Tcl_UniChar *uniStr, /* Unicode String. */ 01612 CONST Tcl_UniChar *uniPattern, 01613 /* Pattern, which may contain special 01614 * characters. */ 01615 int nocase) /* 0 for case sensitive, 1 for insensitive */ 01616 { 01617 Tcl_UniChar ch1, p; 01618 01619 while (1) { 01620 p = *uniPattern; 01621 01622 /* 01623 * See if we're at the end of both the pattern and the string. If so, 01624 * we succeeded. If we're at the end of the pattern but not at the end 01625 * of the string, we failed. 01626 */ 01627 01628 if (p == 0) { 01629 return (*uniStr == 0); 01630 } 01631 if ((*uniStr == 0) && (p != '*')) { 01632 return 0; 01633 } 01634 01635 /* 01636 * Check for a "*" as the next pattern character. It matches any 01637 * substring. We handle this by skipping all the characters up to the 01638 * next matching one in the pattern, and then calling ourselves 01639 * recursively for each postfix of string, until either we match or we 01640 * reach the end of the string. 01641 */ 01642 01643 if (p == '*') { 01644 /* 01645 * Skip all successive *'s in the pattern 01646 */ 01647 01648 while (*(++uniPattern) == '*') { 01649 /* empty body */ 01650 } 01651 p = *uniPattern; 01652 if (p == 0) { 01653 return 1; 01654 } 01655 if (nocase) { 01656 p = Tcl_UniCharToLower(p); 01657 } 01658 while (1) { 01659 /* 01660 * Optimization for matching - cruise through the string 01661 * quickly if the next char in the pattern isn't a special 01662 * character 01663 */ 01664 01665 if ((p != '[') && (p != '?') && (p != '\\')) { 01666 if (nocase) { 01667 while (*uniStr && (p != *uniStr) 01668 && (p != Tcl_UniCharToLower(*uniStr))) { 01669 uniStr++; 01670 } 01671 } else { 01672 while (*uniStr && (p != *uniStr)) { 01673 uniStr++; 01674 } 01675 } 01676 } 01677 if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) { 01678 return 1; 01679 } 01680 if (*uniStr == 0) { 01681 return 0; 01682 } 01683 uniStr++; 01684 } 01685 } 01686 01687 /* 01688 * Check for a "?" as the next pattern character. It matches any 01689 * single character. 01690 */ 01691 01692 if (p == '?') { 01693 uniPattern++; 01694 uniStr++; 01695 continue; 01696 } 01697 01698 /* 01699 * Check for a "[" as the next pattern character. It is followed by a 01700 * list of characters that are acceptable, or by a range (two 01701 * characters separated by "-"). 01702 */ 01703 01704 if (p == '[') { 01705 Tcl_UniChar startChar, endChar; 01706 01707 uniPattern++; 01708 ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr); 01709 uniStr++; 01710 while (1) { 01711 if ((*uniPattern == ']') || (*uniPattern == 0)) { 01712 return 0; 01713 } 01714 startChar = (nocase ? Tcl_UniCharToLower(*uniPattern) 01715 : *uniPattern); 01716 uniPattern++; 01717 if (*uniPattern == '-') { 01718 uniPattern++; 01719 if (*uniPattern == 0) { 01720 return 0; 01721 } 01722 endChar = (nocase ? Tcl_UniCharToLower(*uniPattern) 01723 : *uniPattern); 01724 uniPattern++; 01725 if (((startChar <= ch1) && (ch1 <= endChar)) 01726 || ((endChar <= ch1) && (ch1 <= startChar))) { 01727 /* 01728 * Matches ranges of form [a-z] or [z-a]. 01729 */ 01730 break; 01731 } 01732 } else if (startChar == ch1) { 01733 break; 01734 } 01735 } 01736 while (*uniPattern != ']') { 01737 if (*uniPattern == 0) { 01738 uniPattern--; 01739 break; 01740 } 01741 uniPattern++; 01742 } 01743 uniPattern++; 01744 continue; 01745 } 01746 01747 /* 01748 * If the next pattern character is '\', just strip off the '\' so we 01749 * do exact matching on the character that follows. 01750 */ 01751 01752 if (p == '\\') { 01753 if (*(++uniPattern) == '\0') { 01754 return 0; 01755 } 01756 } 01757 01758 /* 01759 * There's no special character. Just make sure that the next bytes of 01760 * each string match. 01761 */ 01762 01763 if (nocase) { 01764 if (Tcl_UniCharToLower(*uniStr) != 01765 Tcl_UniCharToLower(*uniPattern)) { 01766 return 0; 01767 } 01768 } else if (*uniStr != *uniPattern) { 01769 return 0; 01770 } 01771 uniStr++; 01772 uniPattern++; 01773 } 01774 } 01775 01776 /* 01777 *---------------------------------------------------------------------- 01778 * 01779 * TclUniCharMatch -- 01780 * 01781 * See if a particular Unicode string matches a particular pattern. 01782 * Allows case insensitivity. This is the Unicode equivalent of the char* 01783 * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted 01784 * Strings, so embedded NULLs are allowed. 01785 * 01786 * Results: 01787 * The return value is 1 if string matches pattern, and 0 otherwise. The 01788 * matching operation permits the following special characters in the 01789 * pattern: *?\[] (see the manual entry for details on what these mean). 01790 * 01791 * Side effects: 01792 * None. 01793 * 01794 *---------------------------------------------------------------------- 01795 */ 01796 01797 int 01798 TclUniCharMatch( 01799 CONST Tcl_UniChar *string, /* Unicode String. */ 01800 int strLen, /* Length of String */ 01801 CONST Tcl_UniChar *pattern, /* Pattern, which may contain special 01802 * characters. */ 01803 int ptnLen, /* Length of Pattern */ 01804 int nocase) /* 0 for case sensitive, 1 for insensitive */ 01805 { 01806 CONST Tcl_UniChar *stringEnd, *patternEnd; 01807 Tcl_UniChar p; 01808 01809 stringEnd = string + strLen; 01810 patternEnd = pattern + ptnLen; 01811 01812 while (1) { 01813 /* 01814 * See if we're at the end of both the pattern and the string. If so, 01815 * we succeeded. If we're at the end of the pattern but not at the end 01816 * of the string, we failed. 01817 */ 01818 01819 if (pattern == patternEnd) { 01820 return (string == stringEnd); 01821 } 01822 p = *pattern; 01823 if ((string == stringEnd) && (p != '*')) { 01824 return 0; 01825 } 01826 01827 /* 01828 * Check for a "*" as the next pattern character. It matches any 01829 * substring. We handle this by skipping all the characters up to the 01830 * next matching one in the pattern, and then calling ourselves 01831 * recursively for each postfix of string, until either we match or we 01832 * reach the end of the string. 01833 */ 01834 01835 if (p == '*') { 01836 /* 01837 * Skip all successive *'s in the pattern. 01838 */ 01839 01840 while (*(++pattern) == '*') { 01841 /* empty body */ 01842 } 01843 if (pattern == patternEnd) { 01844 return 1; 01845 } 01846 p = *pattern; 01847 if (nocase) { 01848 p = Tcl_UniCharToLower(p); 01849 } 01850 while (1) { 01851 /* 01852 * Optimization for matching - cruise through the string 01853 * quickly if the next char in the pattern isn't a special 01854 * character. 01855 */ 01856 01857 if ((p != '[') && (p != '?') && (p != '\\')) { 01858 if (nocase) { 01859 while ((string < stringEnd) && (p != *string) 01860 && (p != Tcl_UniCharToLower(*string))) { 01861 string++; 01862 } 01863 } else { 01864 while ((string < stringEnd) && (p != *string)) { 01865 string++; 01866 } 01867 } 01868 } 01869 if (TclUniCharMatch(string, stringEnd - string, 01870 pattern, patternEnd - pattern, nocase)) { 01871 return 1; 01872 } 01873 if (string == stringEnd) { 01874 return 0; 01875 } 01876 string++; 01877 } 01878 } 01879 01880 /* 01881 * Check for a "?" as the next pattern character. It matches any 01882 * single character. 01883 */ 01884 01885 if (p == '?') { 01886 pattern++; 01887 string++; 01888 continue; 01889 } 01890 01891 /* 01892 * Check for a "[" as the next pattern character. It is followed by a 01893 * list of characters that are acceptable, or by a range (two 01894 * characters separated by "-"). 01895 */ 01896 01897 if (p == '[') { 01898 Tcl_UniChar ch1, startChar, endChar; 01899 01900 pattern++; 01901 ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); 01902 string++; 01903 while (1) { 01904 if ((*pattern == ']') || (pattern == patternEnd)) { 01905 return 0; 01906 } 01907 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); 01908 pattern++; 01909 if (*pattern == '-') { 01910 pattern++; 01911 if (pattern == patternEnd) { 01912 return 0; 01913 } 01914 endChar = (nocase ? Tcl_UniCharToLower(*pattern) 01915 : *pattern); 01916 pattern++; 01917 if (((startChar <= ch1) && (ch1 <= endChar)) 01918 || ((endChar <= ch1) && (ch1 <= startChar))) { 01919 /* 01920 * Matches ranges of form [a-z] or [z-a]. 01921 */ 01922 break; 01923 } 01924 } else if (startChar == ch1) { 01925 break; 01926 } 01927 } 01928 while (*pattern != ']') { 01929 if (pattern == patternEnd) { 01930 pattern--; 01931 break; 01932 } 01933 pattern++; 01934 } 01935 pattern++; 01936 continue; 01937 } 01938 01939 /* 01940 * If the next pattern character is '\', just strip off the '\' so we 01941 * do exact matching on the character that follows. 01942 */ 01943 01944 if (p == '\\') { 01945 if (++pattern == patternEnd) { 01946 return 0; 01947 } 01948 } 01949 01950 /* 01951 * There's no special character. Just make sure that the next bytes of 01952 * each string match. 01953 */ 01954 01955 if (nocase) { 01956 if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { 01957 return 0; 01958 } 01959 } else if (*string != *pattern) { 01960 return 0; 01961 } 01962 string++; 01963 pattern++; 01964 } 01965 } 01966 01967 /* 01968 * Local Variables: 01969 * mode: c 01970 * c-basic-offset: 4 01971 * fill-column: 78 01972 * End: 01973 */
Generated on Wed Mar 12 12:18:23 2008 by 1.5.1 |