/home/ntakagi/work/tcl8.5.1/generic/tclUtf.c Source File

00001 /*
00002  * tclUtf.c --
00003  *
00004  *      Routines for manipulating UTF-8 strings.
00005  *
00006  * Copyright (c) 1997-1998 Sun Microsystems, Inc.
00007  *
00008  * See the file "license.terms" for information on usage and redistribution of
00009  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
00010  *
00011  * RCS: @(#) $Id: tclUtf.c,v 1.37 2005/10/31 15:59:41 dkf Exp $
00012  */
00013 
00014 #include "tclInt.h"
00015 
00016 /*
00017  * Include the static character classification tables and macros.
00018  */
00019 
00020 #include "tclUniData.c"
00021 
/*
 * Bit masks used for fast character category tests. Each x_BITS mask has one
 * bit set for every Unicode category that belongs to the class; the mask is
 * shifted right by a category value to determine whether that category is
 * included in the set.
 */

#define ALPHA_BITS \
        ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) | \
         (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | \
         (1 << OTHER_LETTER))

#define DIGIT_BITS      (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS \
        ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) | \
         (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS  (1 << CONNECTOR_PUNCTUATION)

/*
 * Every punctuation category, including connector punctuation.
 */

#define PUNCT_BITS \
        (CONNECTOR_BITS | \
         (1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
         (1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
         (1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

/*
 * Printable characters: letters, digits, separators, marks, the remaining
 * number categories, all punctuation and all symbols.
 */

#define PRINT_BITS \
        (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | PUNCT_BITS | \
         (1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
         (1 << COMBINING_SPACING_MARK) | \
         (1 << LETTER_NUMBER) | (1 << OTHER_NUMBER) | \
         (1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
         (1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
00052 
00053 /*
00054  * Unicode characters less than this value are represented by themselves in
00055  * UTF-8 strings.
00056  */
00057 
00058 #define UNICODE_SELF    0x80
00059 
00060 /*
00061  * The following structures are used when mapping between Unicode (UCS-2) and
00062  * UTF-8.
00063  */
00064 
00065 static CONST unsigned char totalBytes[256] = {
00066     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00067     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00068     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00069     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00070     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00071     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
00072     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
00073     3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
00074 #if TCL_UTF_MAX > 3
00075     4,4,4,4,4,4,4,4,
00076 #else
00077     1,1,1,1,1,1,1,1,
00078 #endif
00079 #if TCL_UTF_MAX > 4
00080     5,5,5,5,
00081 #else
00082     1,1,1,1,
00083 #endif
00084 #if TCL_UTF_MAX > 5
00085     6,6,6,6
00086 #else
00087     1,1,1,1
00088 #endif
00089 };
00090 
00091 /*
00092  * Functions used only in this module.
00093  */
00094 
00095 static int              UtfCount(int ch);
00096 
00097 /*
00098  *---------------------------------------------------------------------------
00099  *
00100  * UtfCount --
00101  *
00102  *      Find the number of bytes in the Utf character "ch".
00103  *
00104  * Results:
00105  *      The return values is the number of bytes in the Utf character "ch".
00106  *
00107  * Side effects:
00108  *      None.
00109  *
00110  *---------------------------------------------------------------------------
00111  */
00112 
00113 INLINE static int
00114 UtfCount(
00115     int ch)                     /* The Tcl_UniChar whose size is returned. */
00116 {
00117     if ((ch > 0) && (ch < UNICODE_SELF)) {
00118         return 1;
00119     }
00120     if (ch <= 0x7FF) {
00121         return 2;
00122     }
00123     if (ch <= 0xFFFF) {
00124         return 3;
00125     }
00126 #if TCL_UTF_MAX > 3
00127     if (ch <= 0x1FFFFF) {
00128         return 4;
00129     }
00130     if (ch <= 0x3FFFFFF) {
00131         return 5;
00132     }
00133     if (ch <= 0x7FFFFFFF) {
00134         return 6;
00135     }
00136 #endif
00137     return 3;
00138 }
00139 
00140 /*
00141  *---------------------------------------------------------------------------
00142  *
00143  * Tcl_UniCharToUtf --
00144  *
00145  *      Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
00146  *      provided buffer. Equivalent to Plan 9 runetochar().
00147  *
00148  * Results:
00149  *      The return values is the number of bytes in the buffer that were
00150  *      consumed.
00151  *
00152  * Side effects:
00153  *      None.
00154  *
00155  *---------------------------------------------------------------------------
00156  */
00157 
00158 INLINE int
00159 Tcl_UniCharToUtf(
00160     int ch,                     /* The Tcl_UniChar to be stored in the
00161                                  * buffer. */
00162     char *buf)                  /* Buffer in which the UTF-8 representation of
00163                                  * the Tcl_UniChar is stored. Buffer must be
00164                                  * large enough to hold the UTF-8 character
00165                                  * (at most TCL_UTF_MAX bytes). */
00166 {
00167     if ((ch > 0) && (ch < UNICODE_SELF)) {
00168         buf[0] = (char) ch;
00169         return 1;
00170     }
00171     if (ch >= 0) {
00172         if (ch <= 0x7FF) {
00173             buf[1] = (char) ((ch | 0x80) & 0xBF);
00174             buf[0] = (char) ((ch >> 6) | 0xC0);
00175             return 2;
00176         }
00177         if (ch <= 0xFFFF) {
00178         three:
00179             buf[2] = (char) ((ch | 0x80) & 0xBF);
00180             buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
00181             buf[0] = (char) ((ch >> 12) | 0xE0);
00182             return 3;
00183         }
00184 
00185 #if TCL_UTF_MAX > 3
00186         if (ch <= 0x1FFFFF) {
00187             buf[3] = (char) ((ch | 0x80) & 0xBF);
00188             buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
00189             buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
00190             buf[0] = (char) ((ch >> 18) | 0xF0);
00191             return 4;
00192         }
00193         if (ch <= 0x3FFFFFF) {
00194             buf[4] = (char) ((ch | 0x80) & 0xBF);
00195             buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
00196             buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
00197             buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
00198             buf[0] = (char) ((ch >> 24) | 0xF8);
00199             return 5;
00200         }
00201         if (ch <= 0x7FFFFFFF) {
00202             buf[5] = (char) ((ch | 0x80) & 0xBF);
00203             buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
00204             buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
00205             buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
00206             buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
00207             buf[0] = (char) ((ch >> 30) | 0xFC);
00208             return 6;
00209         }
00210 #endif
00211     }
00212 
00213     ch = 0xFFFD;
00214     goto three;
00215 }
00216 
00217 /*
00218  *---------------------------------------------------------------------------
00219  *
00220  * Tcl_UniCharToUtfDString --
00221  *
00222  *      Convert the given Unicode string to UTF-8.
00223  *
00224  * Results:
00225  *      The return value is a pointer to the UTF-8 representation of the
00226  *      Unicode string. Storage for the return value is appended to the end of
00227  *      dsPtr.
00228  *
00229  * Side effects:
00230  *      None.
00231  *
00232  *---------------------------------------------------------------------------
00233  */
00234 
00235 char *
00236 Tcl_UniCharToUtfDString(
00237     CONST Tcl_UniChar *uniStr,  /* Unicode string to convert to UTF-8. */
00238     int uniLength,              /* Length of Unicode string in Tcl_UniChars
00239                                  * (must be >= 0). */
00240     Tcl_DString *dsPtr)         /* UTF-8 representation of string is appended
00241                                  * to this previously initialized DString. */
00242 {
00243     CONST Tcl_UniChar *w, *wEnd;
00244     char *p, *string;
00245     int oldLength;
00246 
00247     /*
00248      * UTF-8 string length in bytes will be <= Unicode string length *
00249      * TCL_UTF_MAX.
00250      */
00251 
00252     oldLength = Tcl_DStringLength(dsPtr);
00253     Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX);
00254     string = Tcl_DStringValue(dsPtr) + oldLength;
00255 
00256     p = string;
00257     wEnd = uniStr + uniLength;
00258     for (w = uniStr; w < wEnd; ) {
00259         p += Tcl_UniCharToUtf(*w, p);
00260         w++;
00261     }
00262     Tcl_DStringSetLength(dsPtr, oldLength + (p - string));
00263 
00264     return string;
00265 }
00266 
00267 /*
00268  *---------------------------------------------------------------------------
00269  *
00270  * Tcl_UtfToUniChar --
00271  *
00272  *      Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8
00273  *      sequences are converted to valid Tcl_UniChars and processing
00274  *      continues. Equivalent to Plan 9 chartorune().
00275  *
00276  *      The caller must ensure that the source buffer is long enough that this
00277  *      routine does not run off the end and dereference non-existent memory
00278  *      looking for trail bytes. If the source buffer is known to be '\0'
00279  *      terminated, this cannot happen. Otherwise, the caller should call
00280  *      Tcl_UtfCharComplete() before calling this routine to ensure that
00281  *      enough bytes remain in the string.
00282  *
00283  * Results:
00284  *      *chPtr is filled with the Tcl_UniChar, and the return value is the
00285  *      number of bytes from the UTF-8 string that were consumed.
00286  *
00287  * Side effects:
00288  *      None.
00289  *
00290  *---------------------------------------------------------------------------
00291  */
00292 
00293 int
00294 Tcl_UtfToUniChar(
00295     register CONST char *src,   /* The UTF-8 string. */
00296     register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by
00297                                  * the UTF-8 string. */
00298 {
00299     register int byte;
00300 
00301     /*
00302      * Unroll 1 to 3 byte UTF-8 sequences, use loop to handle longer ones.
00303      */
00304 
00305     byte = *((unsigned char *) src);
00306     if (byte < 0xC0) {
00307         /*
00308          * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
00309          * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
00310          * characters representing themselves.
00311          */
00312 
00313         *chPtr = (Tcl_UniChar) byte;
00314         return 1;
00315     } else if (byte < 0xE0) {
00316         if ((src[1] & 0xC0) == 0x80) {
00317             /*
00318              * Two-byte-character lead-byte followed by a trail-byte.
00319              */
00320 
00321             *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
00322             return 2;
00323         }
00324 
00325         /*
00326          * A two-byte-character lead-byte not followed by trail-byte
00327          * represents itself.
00328          */
00329 
00330         *chPtr = (Tcl_UniChar) byte;
00331         return 1;
00332     } else if (byte < 0xF0) {
00333         if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
00334             /*
00335              * Three-byte-character lead byte followed by two trail bytes.
00336              */
00337 
00338             *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
00339                     | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
00340             return 3;
00341         }
00342 
00343         /*
00344          * A three-byte-character lead-byte not followed by two trail-bytes
00345          * represents itself.
00346          */
00347 
00348         *chPtr = (Tcl_UniChar) byte;
00349         return 1;
00350     }
00351 #if TCL_UTF_MAX > 3
00352     {
00353         int ch, total, trail;
00354 
00355         total = totalBytes[byte];
00356         trail = total - 1;
00357         if (trail > 0) {
00358             ch = byte & (0x3F >> trail);
00359             do {
00360                 src++;
00361                 if ((*src & 0xC0) != 0x80) {
00362                     *chPtr = byte;
00363                     return 1;
00364                 }
00365                 ch <<= 6;
00366                 ch |= (*src & 0x3F);
00367                 trail--;
00368             } while (trail > 0);
00369             *chPtr = ch;
00370             return total;
00371         }
00372     }
00373 #endif
00374 
00375     *chPtr = (Tcl_UniChar) byte;
00376     return 1;
00377 }
00378 
00379 /*
00380  *---------------------------------------------------------------------------
00381  *
00382  * Tcl_UtfToUniCharDString --
00383  *
00384  *      Convert the UTF-8 string to Unicode.
00385  *
00386  * Results:
00387  *      The return value is a pointer to the Unicode representation of the
00388  *      UTF-8 string. Storage for the return value is appended to the end of
00389  *      dsPtr. The Unicode string is terminated with a Unicode NULL character.
00390  *
00391  * Side effects:
00392  *      None.
00393  *
00394  *---------------------------------------------------------------------------
00395  */
00396 
00397 Tcl_UniChar *
00398 Tcl_UtfToUniCharDString(
00399     CONST char *src,            /* UTF-8 string to convert to Unicode. */
00400     int length,                 /* Length of UTF-8 string in bytes, or -1 for
00401                                  * strlen(). */
00402     Tcl_DString *dsPtr)         /* Unicode representation of string is
00403                                  * appended to this previously initialized
00404                                  * DString. */
00405 {
00406     Tcl_UniChar *w, *wString;
00407     CONST char *p, *end;
00408     int oldLength;
00409 
00410     if (length < 0) {
00411         length = strlen(src);
00412     }
00413 
00414     /*
00415      * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in
00416      * bytes.
00417      */
00418 
00419     oldLength = Tcl_DStringLength(dsPtr);
00420     Tcl_DStringSetLength(dsPtr,
00421             (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar)));
00422     wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength);
00423 
00424     w = wString;
00425     end = src + length;
00426     for (p = src; p < end; ) {
00427         p += TclUtfToUniChar(p, w);
00428         w++;
00429     }
00430     *w = '\0';
00431     Tcl_DStringSetLength(dsPtr,
00432             (oldLength + ((char *) w - (char *) wString)));
00433 
00434     return wString;
00435 }
00436 
00437 /*
00438  *---------------------------------------------------------------------------
00439  *
00440  * Tcl_UtfCharComplete --
00441  *
00442  *      Determine if the UTF-8 string of the given length is long enough to be
00443  *      decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8
00444  *      string is properly formed. Equivalent to Plan 9 fullrune().
00445  *
00446  * Results:
00447  *      The return value is 0 if the string is not long enough, non-zero
00448  *      otherwise.
00449  *
00450  * Side effects:
00451  *      None.
00452  *
00453  *---------------------------------------------------------------------------
00454  */
00455 
00456 int
00457 Tcl_UtfCharComplete(
00458     CONST char *src,            /* String to check if first few bytes contain
00459                                  * a complete UTF-8 character. */
00460     int length)                 /* Length of above string in bytes. */
00461 {
00462     int ch;
00463 
00464     ch = *((unsigned char *) src);
00465     return length >= totalBytes[ch];
00466 }
00467 
00468 /*
00469  *---------------------------------------------------------------------------
00470  *
00471  * Tcl_NumUtfChars --
00472  *
00473  *      Returns the number of characters (not bytes) in the UTF-8 string, not
00474  *      including the terminating NULL byte. This is equivalent to Plan 9
00475  *      utflen() and utfnlen().
00476  *
00477  * Results:
00478  *      As above.
00479  *
00480  * Side effects:
00481  *      None.
00482  *
00483  *---------------------------------------------------------------------------
00484  */
00485 
00486 int
00487 Tcl_NumUtfChars(
00488     register CONST char *src,   /* The UTF-8 string to measure. */
00489     int length)                 /* The length of the string in bytes, or -1
00490                                  * for strlen(string). */
00491 {
00492     Tcl_UniChar ch;
00493     register Tcl_UniChar *chPtr = &ch;
00494     register int i;
00495 
00496     /*
00497      * The separate implementations are faster.
00498      *
00499      * Since this is a time-sensitive function, we also do the check for the
00500      * single-byte char case specially.
00501      */
00502 
00503     i = 0;
00504     if (length < 0) {
00505         while (*src != '\0') {
00506             src += TclUtfToUniChar(src, chPtr);
00507             i++;
00508         }
00509     } else {
00510         register int n;
00511 
00512         while (length > 0) {
00513             if (UCHAR(*src) < 0xC0) {
00514                 length--;
00515                 src++;
00516             } else {
00517                 n = Tcl_UtfToUniChar(src, chPtr);
00518                 length -= n;
00519                 src += n;
00520             }
00521             i++;
00522         }
00523     }
00524     return i;
00525 }
00526 
00527 /*
00528  *---------------------------------------------------------------------------
00529  *
00530  * Tcl_UtfFindFirst --
00531  *
00532  *      Returns a pointer to the first occurance of the given Tcl_UniChar in
00533  *      the NULL-terminated UTF-8 string. The NULL terminator is considered
00534  *      part of the UTF-8 string. Equivalent to Plan 9 utfrune().
00535  *
00536  * Results:
00537  *      As above. If the Tcl_UniChar does not exist in the given string, the
00538  *      return value is NULL.
00539  *
00540  * Side effects:
00541  *      None.
00542  *
00543  *---------------------------------------------------------------------------
00544  */
00545 
00546 CONST char *
00547 Tcl_UtfFindFirst(
00548     CONST char *src,            /* The UTF-8 string to be searched. */
00549     int ch)                     /* The Tcl_UniChar to search for. */
00550 {
00551     int len;
00552     Tcl_UniChar find;
00553 
00554     while (1) {
00555         len = TclUtfToUniChar(src, &find);
00556         if (find == ch) {
00557             return src;
00558         }
00559         if (*src == '\0') {
00560             return NULL;
00561         }
00562         src += len;
00563     }
00564 }
00565 
00566 /*
00567  *---------------------------------------------------------------------------
00568  *
00569  * Tcl_UtfFindLast --
00570  *
00571  *      Returns a pointer to the last occurance of the given Tcl_UniChar in
00572  *      the NULL-terminated UTF-8 string. The NULL terminator is considered
00573  *      part of the UTF-8 string. Equivalent to Plan 9 utfrrune().
00574  *
00575  * Results:
00576  *      As above. If the Tcl_UniChar does not exist in the given string, the
00577  *      return value is NULL.
00578  *
00579  * Side effects:
00580  *      None.
00581  *
00582  *---------------------------------------------------------------------------
00583  */
00584 
00585 CONST char *
00586 Tcl_UtfFindLast(
00587     CONST char *src,            /* The UTF-8 string to be searched. */
00588     int ch)                     /* The Tcl_UniChar to search for. */
00589 {
00590     int len;
00591     Tcl_UniChar find;
00592     CONST char *last;
00593 
00594     last = NULL;
00595     while (1) {
00596         len = TclUtfToUniChar(src, &find);
00597         if (find == ch) {
00598             last = src;
00599         }
00600         if (*src == '\0') {
00601             break;
00602         }
00603         src += len;
00604     }
00605     return last;
00606 }
00607 
00608 /*
00609  *---------------------------------------------------------------------------
00610  *
00611  * Tcl_UtfNext --
00612  *
00613  *      Given a pointer to some current location in a UTF-8 string, move
00614  *      forward one character. The caller must ensure that they are not asking
00615  *      for the next character after the last character in the string.
00616  *
00617  * Results:
00618  *      The return value is the pointer to the next character in the UTF-8
00619  *      string.
00620  *
00621  * Side effects:
00622  *      None.
00623  *
00624  *---------------------------------------------------------------------------
00625  */
00626 
00627 CONST char *
00628 Tcl_UtfNext(
00629     CONST char *src)            /* The current location in the string. */
00630 {
00631     Tcl_UniChar ch;
00632 
00633     return src + TclUtfToUniChar(src, &ch);
00634 }
00635 
00636 /*
00637  *---------------------------------------------------------------------------
00638  *
00639  * Tcl_UtfPrev --
00640  *
00641  *      Given a pointer to some current location in a UTF-8 string, move
00642  *      backwards one character. This works correctly when the pointer is in
00643  *      the middle of a UTF-8 character.
00644  *
00645  * Results:
00646  *      The return value is a pointer to the previous character in the UTF-8
00647  *      string. If the current location was already at the beginning of the
00648  *      string, the return value will also be a pointer to the beginning of
00649  *      the string.
00650  *
00651  * Side effects:
00652  *      None.
00653  *
00654  *---------------------------------------------------------------------------
00655  */
00656 
00657 CONST char *
00658 Tcl_UtfPrev(
00659     CONST char *src,            /* The current location in the string. */
00660     CONST char *start)          /* Pointer to the beginning of the string, to
00661                                  * avoid going backwards too far. */
00662 {
00663     CONST char *look;
00664     int i, byte;
00665 
00666     src--;
00667     look = src;
00668     for (i = 0; i < TCL_UTF_MAX; i++) {
00669         if (look < start) {
00670             if (src < start) {
00671                 src = start;
00672             }
00673             break;
00674         }
00675         byte = *((unsigned char *) look);
00676         if (byte < 0x80) {
00677             break;
00678         }
00679         if (byte >= 0xC0) {
00680             return look;
00681         }
00682         look--;
00683     }
00684     return src;
00685 }
00686 
00687 /*
00688  *---------------------------------------------------------------------------
00689  *
00690  * Tcl_UniCharAtIndex --
00691  *
00692  *      Returns the Unicode character represented at the specified character
00693  *      (not byte) position in the UTF-8 string.
00694  *
00695  * Results:
00696  *      As above.
00697  *
00698  * Side effects:
00699  *      None.
00700  *
00701  *---------------------------------------------------------------------------
00702  */
00703 
00704 Tcl_UniChar
00705 Tcl_UniCharAtIndex(
00706     register CONST char *src,   /* The UTF-8 string to dereference. */
00707     register int index)         /* The position of the desired character. */
00708 {
00709     Tcl_UniChar ch;
00710 
00711     while (index >= 0) {
00712         index--;
00713         src += TclUtfToUniChar(src, &ch);
00714     }
00715     return ch;
00716 }
00717 
00718 /*
00719  *---------------------------------------------------------------------------
00720  *
00721  * Tcl_UtfAtIndex --
00722  *
00723  *      Returns a pointer to the specified character (not byte) position in
00724  *      the UTF-8 string.
00725  *
00726  * Results:
00727  *      As above.
00728  *
00729  * Side effects:
00730  *      None.
00731  *
00732  *---------------------------------------------------------------------------
00733  */
00734 
00735 CONST char *
00736 Tcl_UtfAtIndex(
00737     register CONST char *src,   /* The UTF-8 string. */
00738     register int index)         /* The position of the desired character. */
00739 {
00740     Tcl_UniChar ch;
00741 
00742     while (index > 0) {
00743         index--;
00744         src += TclUtfToUniChar(src, &ch);
00745     }
00746     return src;
00747 }
00748 
00749 /*
00750  *---------------------------------------------------------------------------
00751  *
00752  * Tcl_UtfBackslash --
00753  *
00754  *      Figure out how to handle a backslash sequence.
00755  *
00756  * Results:
00757  *      Stores the bytes represented by the backslash sequence in dst and
00758  *      returns the number of bytes written to dst. At most TCL_UTF_MAX bytes
00759  *      are written to dst; dst must have been large enough to accept those
00760  *      bytes. If readPtr isn't NULL then it is filled in with a count of the
00761  *      number of bytes in the backslash sequence.
00762  *
00763  * Side effects:
00764  *      The maximum number of bytes it takes to represent a Unicode character
00765  *      in UTF-8 is guaranteed to be less than the number of bytes used to
00766  *      express the backslash sequence that represents that Unicode character.
00767  *      If the target buffer into which the caller is going to store the bytes
00768  *      that represent the Unicode character is at least as large as the
00769  *      source buffer from which the backslashed sequence was extracted, no
00770  *      buffer overruns should occur.
00771  *
00772  *---------------------------------------------------------------------------
00773  */
00774 
00775 int
00776 Tcl_UtfBackslash(
00777     CONST char *src,            /* Points to the backslash character of a
00778                                  * backslash sequence. */
00779     int *readPtr,               /* Fill in with number of characters read from
00780                                  * src, unless NULL. */
00781     char *dst)                  /* Filled with the bytes represented by the
00782                                  * backslash sequence. */
00783 {
00784 #define LINE_LENGTH 128
00785     int numRead;
00786     int result;
00787 
00788     result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst);
00789     if (numRead == LINE_LENGTH) {
00790         /*
00791          * We ate a whole line. Pay the price of a strlen()
00792          */
00793 
00794         result = TclParseBackslash(src, (int)strlen(src), &numRead, dst);
00795     }
00796     if (readPtr != NULL) {
00797         *readPtr = numRead;
00798     }
00799     return result;
00800 }
00801 
00802 /*
00803  *----------------------------------------------------------------------
00804  *
00805  * Tcl_UtfToUpper --
00806  *
00807  *      Convert lowercase characters to uppercase characters in a UTF string
00808  *      in place. The conversion may shrink the UTF string.
00809  *
00810  * Results:
00811  *      Returns the number of bytes in the resulting string excluding the
00812  *      trailing null.
00813  *
00814  * Side effects:
00815  *      Writes a terminating null after the last converted character.
00816  *
00817  *----------------------------------------------------------------------
00818  */
00819 
00820 int
00821 Tcl_UtfToUpper(
00822     char *str)                  /* String to convert in place. */
00823 {
00824     Tcl_UniChar ch, upChar;
00825     char *src, *dst;
00826     int bytes;
00827 
00828     /*
00829      * Iterate over the string until we hit the terminating null.
00830      */
00831 
00832     src = dst = str;
00833     while (*src) {
00834         bytes = TclUtfToUniChar(src, &ch);
00835         upChar = Tcl_UniCharToUpper(ch);
00836 
00837         /*
00838          * To keep badly formed Utf strings from getting inflated by the
00839          * conversion (thereby causing a segfault), only copy the upper case
00840          * char to dst if its size is <= the original char.
00841          */
00842 
00843         if (bytes < UtfCount(upChar)) {
00844             memcpy(dst, src, (size_t) bytes);
00845             dst += bytes;
00846         } else {
00847             dst += Tcl_UniCharToUtf(upChar, dst);
00848         }
00849         src += bytes;
00850     }
00851     *dst = '\0';
00852     return (dst - str);
00853 }
00854 
00855 /*
00856  *----------------------------------------------------------------------
00857  *
00858  * Tcl_UtfToLower --
00859  *
00860  *      Convert uppercase characters to lowercase characters in a UTF string
00861  *      in place. The conversion may shrink the UTF string.
00862  *
00863  * Results:
00864  *      Returns the number of bytes in the resulting string excluding the
00865  *      trailing null.
00866  *
00867  * Side effects:
00868  *      Writes a terminating null after the last converted character.
00869  *
00870  *----------------------------------------------------------------------
00871  */
00872 
00873 int
00874 Tcl_UtfToLower(
00875     char *str)                  /* String to convert in place. */
00876 {
00877     Tcl_UniChar ch, lowChar;
00878     char *src, *dst;
00879     int bytes;
00880 
00881     /*
00882      * Iterate over the string until we hit the terminating null.
00883      */
00884 
00885     src = dst = str;
00886     while (*src) {
00887         bytes = TclUtfToUniChar(src, &ch);
00888         lowChar = Tcl_UniCharToLower(ch);
00889 
00890         /*
00891          * To keep badly formed Utf strings from getting inflated by the
00892          * conversion (thereby causing a segfault), only copy the lower case
00893          * char to dst if its size is <= the original char.
00894          */
00895 
00896         if (bytes < UtfCount(lowChar)) {
00897             memcpy(dst, src, (size_t) bytes);
00898             dst += bytes;
00899         } else {
00900             dst += Tcl_UniCharToUtf(lowChar, dst);
00901         }
00902         src += bytes;
00903     }
00904     *dst = '\0';
00905     return (dst - str);
00906 }
00907 
00908 /*
00909  *----------------------------------------------------------------------
00910  *
00911  * Tcl_UtfToTitle --
00912  *
00913  *      Changes the first character of a UTF string to title case or uppercase
00914  *      and the rest of the string to lowercase. The conversion happens in
00915  *      place and may shrink the UTF string.
00916  *
00917  * Results:
00918  *      Returns the number of bytes in the resulting string excluding the
00919  *      trailing null.
00920  *
00921  * Side effects:
00922  *      Writes a terminating null after the last converted character.
00923  *
00924  *----------------------------------------------------------------------
00925  */
00926 
00927 int
00928 Tcl_UtfToTitle(
00929     char *str)                  /* String to convert in place. */
00930 {
00931     Tcl_UniChar ch, titleChar, lowChar;
00932     char *src, *dst;
00933     int bytes;
00934 
00935     /*
00936      * Capitalize the first character and then lowercase the rest of the
00937      * characters until we get to a null.
00938      */
00939 
00940     src = dst = str;
00941 
00942     if (*src) {
00943         bytes = TclUtfToUniChar(src, &ch);
00944         titleChar = Tcl_UniCharToTitle(ch);
00945 
00946         if (bytes < UtfCount(titleChar)) {
00947             memcpy(dst, src, (size_t) bytes);
00948             dst += bytes;
00949         } else {
00950             dst += Tcl_UniCharToUtf(titleChar, dst);
00951         }
00952         src += bytes;
00953     }
00954     while (*src) {
00955         bytes = TclUtfToUniChar(src, &ch);
00956         lowChar = Tcl_UniCharToLower(ch);
00957 
00958         if (bytes < UtfCount(lowChar)) {
00959             memcpy(dst, src, (size_t) bytes);
00960             dst += bytes;
00961         } else {
00962             dst += Tcl_UniCharToUtf(lowChar, dst);
00963         }
00964         src += bytes;
00965     }
00966     *dst = '\0';
00967     return (dst - str);
00968 }
00969 
00970 /*
00971  *----------------------------------------------------------------------
00972  *
00973  * TclpUtfNcmp2 --
00974  *
00975  *      Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and
00976  *      ct are assumed to be at least numBytes bytes long.
00977  *
00978  * Results:
00979  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
00980  *
00981  * Side effects:
00982  *      None.
00983  *
00984  *----------------------------------------------------------------------
00985  */
00986 
00987 int
00988 TclpUtfNcmp2(
00989     CONST char *cs,             /* UTF string to compare to ct. */
00990     CONST char *ct,             /* UTF string cs is compared to. */
00991     unsigned long numBytes)     /* Number of *bytes* to compare. */
00992 {
00993     /*
00994      * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to
00995      * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes
00996      * fine in the strcmp manner.
00997      */
00998 
00999     register int result = 0;
01000 
01001     for ( ; numBytes != 0; numBytes--, cs++, ct++) {
01002         if (*cs != *ct) {
01003             result = UCHAR(*cs) - UCHAR(*ct);
01004             break;
01005         }
01006     }
01007     if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
01008         unsigned char c1, c2;
01009 
01010         c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
01011         c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
01012         result = (c1 - c2);
01013     }
01014     return result;
01015 }
01016 
01017 /*
01018  *----------------------------------------------------------------------
01019  *
01020  * Tcl_UtfNcmp --
01021  *
01022  *      Compare at most numChars UTF chars of string cs to string ct. Both cs
01023  *      and ct are assumed to be at least numChars UTF chars long.
01024  *
01025  * Results:
01026  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
01027  *
01028  * Side effects:
01029  *      None.
01030  *
01031  *----------------------------------------------------------------------
01032  */
01033 
01034 int
01035 Tcl_UtfNcmp(
01036     CONST char *cs,             /* UTF string to compare to ct. */
01037     CONST char *ct,             /* UTF string cs is compared to. */
01038     unsigned long numChars)     /* Number of UTF chars to compare. */
01039 {
01040     Tcl_UniChar ch1, ch2;
01041 
01042     /*
01043      * Cannot use 'memcmp(cs, ct, n);' as byte representation of \u0000 (the
01044      * pair of bytes 0xc0,0x80) is larger than byte representation of \u0001
01045      * (the byte 0x01.)
01046      */
01047 
01048     while (numChars-- > 0) {
01049         /*
01050          * n must be interpreted as chars, not bytes. This should be called
01051          * only when both strings are of at least n chars long (no need for \0
01052          * check)
01053          */
01054 
01055         cs += TclUtfToUniChar(cs, &ch1);
01056         ct += TclUtfToUniChar(ct, &ch2);
01057         if (ch1 != ch2) {
01058             return (ch1 - ch2);
01059         }
01060     }
01061     return 0;
01062 }
01063 
01064 /*
01065  *----------------------------------------------------------------------
01066  *
01067  * Tcl_UtfNcasecmp --
01068  *
01069  *      Compare at most numChars UTF chars of string cs to string ct case
01070  *      insensitive. Both cs and ct are assumed to be at least numChars UTF
01071  *      chars long.
01072  *
01073  * Results:
01074  *      Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
01075  *
01076  * Side effects:
01077  *      None.
01078  *
01079  *----------------------------------------------------------------------
01080  */
01081 
01082 int
01083 Tcl_UtfNcasecmp(
01084     CONST char *cs,             /* UTF string to compare to ct. */
01085     CONST char *ct,             /* UTF string cs is compared to. */
01086     unsigned long numChars)     /* Number of UTF chars to compare. */
01087 {
01088     Tcl_UniChar ch1, ch2;
01089     while (numChars-- > 0) {
01090         /*
01091          * n must be interpreted as chars, not bytes.
01092          * This should be called only when both strings are of
01093          * at least n chars long (no need for \0 check)
01094          */
01095         cs += TclUtfToUniChar(cs, &ch1);
01096         ct += TclUtfToUniChar(ct, &ch2);
01097         if (ch1 != ch2) {
01098             ch1 = Tcl_UniCharToLower(ch1);
01099             ch2 = Tcl_UniCharToLower(ch2);
01100             if (ch1 != ch2) {
01101                 return (ch1 - ch2);
01102             }
01103         }
01104     }
01105     return 0;
01106 }
01107 
01108 /*
01109  *----------------------------------------------------------------------
01110  *
01111  * Tcl_UniCharToUpper --
01112  *
01113  *      Compute the uppercase equivalent of the given Unicode character.
01114  *
01115  * Results:
01116  *      Returns the uppercase Unicode character.
01117  *
01118  * Side effects:
01119  *      None.
01120  *
01121  *----------------------------------------------------------------------
01122  */
01123 
01124 Tcl_UniChar
01125 Tcl_UniCharToUpper(
01126     int ch)                     /* Unicode character to convert. */
01127 {
01128     int info = GetUniCharInfo(ch);
01129 
01130     if (GetCaseType(info) & 0x04) {
01131         return (Tcl_UniChar) (ch - GetDelta(info));
01132     } else {
01133         return ch;
01134     }
01135 }
01136 
01137 /*
01138  *----------------------------------------------------------------------
01139  *
01140  * Tcl_UniCharToLower --
01141  *
01142  *      Compute the lowercase equivalent of the given Unicode character.
01143  *
01144  * Results:
01145  *      Returns the lowercase Unicode character.
01146  *
01147  * Side effects:
01148  *      None.
01149  *
01150  *----------------------------------------------------------------------
01151  */
01152 
01153 Tcl_UniChar
01154 Tcl_UniCharToLower(
01155     int ch)                     /* Unicode character to convert. */
01156 {
01157     int info = GetUniCharInfo(ch);
01158 
01159     if (GetCaseType(info) & 0x02) {
01160         return (Tcl_UniChar) (ch + GetDelta(info));
01161     } else {
01162         return ch;
01163     }
01164 }
01165 
01166 /*
01167  *----------------------------------------------------------------------
01168  *
01169  * Tcl_UniCharToTitle --
01170  *
01171  *      Compute the titlecase equivalent of the given Unicode character.
01172  *
01173  * Results:
01174  *      Returns the titlecase Unicode character.
01175  *
01176  * Side effects:
01177  *      None.
01178  *
01179  *----------------------------------------------------------------------
01180  */
01181 
01182 Tcl_UniChar
01183 Tcl_UniCharToTitle(
01184     int ch)                     /* Unicode character to convert. */
01185 {
01186     int info = GetUniCharInfo(ch);
01187     int mode = GetCaseType(info);
01188 
01189     if (mode & 0x1) {
01190         /*
01191          * Subtract or add one depending on the original case.
01192          */
01193 
01194         return (Tcl_UniChar) (ch + ((mode & 0x4) ? -1 : 1));
01195     } else if (mode == 0x4) {
01196         return (Tcl_UniChar) (ch - GetDelta(info));
01197     } else {
01198         return ch;
01199     }
01200 }
01201 
01202 /*
01203  *----------------------------------------------------------------------
01204  *
01205  * Tcl_UniCharLen --
01206  *
01207  *      Find the length of a UniChar string. The str input must be null
01208  *      terminated.
01209  *
01210  * Results:
01211  *      Returns the length of str in UniChars (not bytes).
01212  *
01213  * Side effects:
01214  *      None.
01215  *
01216  *----------------------------------------------------------------------
01217  */
01218 
01219 int
01220 Tcl_UniCharLen(
01221     CONST Tcl_UniChar *uniStr)  /* Unicode string to find length of. */
01222 {
01223     int len = 0;
01224 
01225     while (*uniStr != '\0') {
01226         len++;
01227         uniStr++;
01228     }
01229     return len;
01230 }
01231 
01232 /*
01233  *----------------------------------------------------------------------
01234  *
01235  * Tcl_UniCharNcmp --
01236  *
01237  *      Compare at most numChars unichars of string ucs to string uct.
01238  *      Both ucs and uct are assumed to be at least numChars unichars long.
01239  *
01240  * Results:
01241  *      Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
01242  *
01243  * Side effects:
01244  *      None.
01245  *
01246  *----------------------------------------------------------------------
01247  */
01248 
01249 int
01250 Tcl_UniCharNcmp(
01251     CONST Tcl_UniChar *ucs,     /* Unicode string to compare to uct. */
01252     CONST Tcl_UniChar *uct,     /* Unicode string ucs is compared to. */
01253     unsigned long numChars)     /* Number of unichars to compare. */
01254 {
01255 #ifdef WORDS_BIGENDIAN
01256     /*
01257      * We are definitely on a big-endian machine; memcmp() is safe
01258      */
01259 
01260     return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar));
01261 
01262 #else /* !WORDS_BIGENDIAN */
01263     /*
01264      * We can't simply call memcmp() because that is not lexically correct.
01265      */
01266 
01267     for ( ; numChars != 0; ucs++, uct++, numChars--) {
01268         if (*ucs != *uct) {
01269             return (*ucs - *uct);
01270         }
01271     }
01272     return 0;
01273 #endif /* WORDS_BIGENDIAN */
01274 }
01275 
01276 /*
01277  *----------------------------------------------------------------------
01278  *
01279  * Tcl_UniCharNcasecmp --
01280  *
01281  *      Compare at most numChars unichars of string ucs to string uct case
01282  *      insensitive. Both ucs and uct are assumed to be at least numChars
01283  *      unichars long.
01284  *
01285  * Results:
01286  *      Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct.
01287  *
01288  * Side effects:
01289  *      None.
01290  *
01291  *----------------------------------------------------------------------
01292  */
01293 
01294 int
01295 Tcl_UniCharNcasecmp(
01296     CONST Tcl_UniChar *ucs,     /* Unicode string to compare to uct. */
01297     CONST Tcl_UniChar *uct,     /* Unicode string ucs is compared to. */
01298     unsigned long numChars)     /* Number of unichars to compare. */
01299 {
01300     for ( ; numChars != 0; numChars--, ucs++, uct++) {
01301         if (*ucs != *uct) {
01302             Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs);
01303             Tcl_UniChar lct = Tcl_UniCharToLower(*uct);
01304 
01305             if (lcs != lct) {
01306                 return (lcs - lct);
01307             }
01308         }
01309     }
01310     return 0;
01311 }
01312 
01313 /*
01314  *----------------------------------------------------------------------
01315  *
01316  * Tcl_UniCharIsAlnum --
01317  *
01318  *      Test if a character is an alphanumeric Unicode character.
01319  *
01320  * Results:
01321  *      Returns 1 if character is alphanumeric.
01322  *
01323  * Side effects:
01324  *      None.
01325  *
01326  *----------------------------------------------------------------------
01327  */
01328 
01329 int
01330 Tcl_UniCharIsAlnum(
01331     int ch)                     /* Unicode character to test. */
01332 {
01333     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
01334 
01335     return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
01336 }
01337 
01338 /*
01339  *----------------------------------------------------------------------
01340  *
01341  * Tcl_UniCharIsAlpha --
01342  *
01343  *      Test if a character is an alphabetic Unicode character.
01344  *
01345  * Results:
01346  *      Returns 1 if character is alphabetic.
01347  *
01348  * Side effects:
01349  *      None.
01350  *
01351  *----------------------------------------------------------------------
01352  */
01353 
01354 int
01355 Tcl_UniCharIsAlpha(
01356     int ch)                     /* Unicode character to test. */
01357 {
01358     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
01359     return ((ALPHA_BITS >> category) & 1);
01360 }
01361 
01362 /*
01363  *----------------------------------------------------------------------
01364  *
01365  * Tcl_UniCharIsControl --
01366  *
01367  *      Test if a character is a Unicode control character.
01368  *
01369  * Results:
01370  *      Returns non-zero if character is a control.
01371  *
01372  * Side effects:
01373  *      None.
01374  *
01375  *----------------------------------------------------------------------
01376  */
01377 
01378 int
01379 Tcl_UniCharIsControl(
01380     int ch)                     /* Unicode character to test. */
01381 {
01382     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
01383 }
01384 
01385 /*
01386  *----------------------------------------------------------------------
01387  *
01388  * Tcl_UniCharIsDigit --
01389  *
01390  *      Test if a character is a numeric Unicode character.
01391  *
01392  * Results:
01393  *      Returns non-zero if character is a digit.
01394  *
01395  * Side effects:
01396  *      None.
01397  *
01398  *----------------------------------------------------------------------
01399  */
01400 
01401 int
01402 Tcl_UniCharIsDigit(
01403     int ch)                     /* Unicode character to test. */
01404 {
01405     return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
01406 }
01407 
01408 /*
01409  *----------------------------------------------------------------------
01410  *
01411  * Tcl_UniCharIsGraph --
01412  *
01413  *      Test if a character is any Unicode print character except space.
01414  *
01415  * Results:
01416  *      Returns non-zero if character is printable, but not space.
01417  *
01418  * Side effects:
01419  *      None.
01420  *
01421  *----------------------------------------------------------------------
01422  */
01423 
01424 int
01425 Tcl_UniCharIsGraph(
01426     int ch)                     /* Unicode character to test. */
01427 {
01428     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
01429     return (((PRINT_BITS >> category) & 1) && ((unsigned char) ch != ' '));
01430 }
01431 
01432 /*
01433  *----------------------------------------------------------------------
01434  *
01435  * Tcl_UniCharIsLower --
01436  *
01437  *      Test if a character is a lowercase Unicode character.
01438  *
01439  * Results:
01440  *      Returns non-zero if character is lowercase.
01441  *
01442  * Side effects:
01443  *      None.
01444  *
01445  *----------------------------------------------------------------------
01446  */
01447 
01448 int
01449 Tcl_UniCharIsLower(
01450     int ch)                     /* Unicode character to test. */
01451 {
01452     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
01453 }
01454 
01455 /*
01456  *----------------------------------------------------------------------
01457  *
01458  * Tcl_UniCharIsPrint --
01459  *
01460  *      Test if a character is a Unicode print character.
01461  *
01462  * Results:
01463  *      Returns non-zero if character is printable.
01464  *
01465  * Side effects:
01466  *      None.
01467  *
01468  *----------------------------------------------------------------------
01469  */
01470 
01471 int
01472 Tcl_UniCharIsPrint(
01473     int ch)                     /* Unicode character to test. */
01474 {
01475     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
01476     return ((PRINT_BITS >> category) & 1);
01477 }
01478 
01479 /*
01480  *----------------------------------------------------------------------
01481  *
01482  * Tcl_UniCharIsPunct --
01483  *
01484  *      Test if a character is a Unicode punctuation character.
01485  *
01486  * Results:
01487  *      Returns non-zero if character is punct.
01488  *
01489  * Side effects:
01490  *      None.
01491  *
01492  *----------------------------------------------------------------------
01493  */
01494 
01495 int
01496 Tcl_UniCharIsPunct(
01497     int ch)                     /* Unicode character to test. */
01498 {
01499     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
01500     return ((PUNCT_BITS >> category) & 1);
01501 }
01502 
01503 /*
01504  *----------------------------------------------------------------------
01505  *
01506  * Tcl_UniCharIsSpace --
01507  *
01508  *      Test if a character is a whitespace Unicode character.
01509  *
01510  * Results:
01511  *      Returns non-zero if character is a space.
01512  *
01513  * Side effects:
01514  *      None.
01515  *
01516  *----------------------------------------------------------------------
01517  */
01518 
01519 int
01520 Tcl_UniCharIsSpace(
01521     int ch)                     /* Unicode character to test. */
01522 {
01523     register int category;
01524 
01525     /*
01526      * If the character is within the first 127 characters, just use the
01527      * standard C function, otherwise consult the Unicode table.
01528      */
01529 
01530     if (ch < 0x80) {
01531         return isspace(UCHAR(ch)); /* INTL: ISO space */
01532     } else {
01533         category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
01534         return ((SPACE_BITS >> category) & 1);
01535     }
01536 }
01537 
01538 /*
01539  *----------------------------------------------------------------------
01540  *
01541  * Tcl_UniCharIsUpper --
01542  *
01543  *      Test if a character is a uppercase Unicode character.
01544  *
01545  * Results:
01546  *      Returns non-zero if character is uppercase.
01547  *
01548  * Side effects:
01549  *      None.
01550  *
01551  *----------------------------------------------------------------------
01552  */
01553 
01554 int
01555 Tcl_UniCharIsUpper(
01556     int ch)                     /* Unicode character to test. */
01557 {
01558     return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
01559 }
01560 
01561 /*
01562  *----------------------------------------------------------------------
01563  *
01564  * Tcl_UniCharIsWordChar --
01565  *
01566  *      Test if a character is alphanumeric or a connector punctuation mark.
01567  *
01568  * Results:
01569  *      Returns 1 if character is a word character.
01570  *
01571  * Side effects:
01572  *      None.
01573  *
01574  *----------------------------------------------------------------------
01575  */
01576 
01577 int
01578 Tcl_UniCharIsWordChar(
01579     int ch)                     /* Unicode character to test. */
01580 {
01581     register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
01582 
01583     return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
01584 }
01585 
/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharCaseMatch --
 *
 *      See if a particular Unicode string matches a particular pattern.
 *      Allows case insensitivity. This is the Unicode equivalent of the char*
 *      Tcl_StringCaseMatch. Both UniChar strings must end with a zero
 *      UniChar: the matcher stops at the first zero element it sees, so it
 *      cannot be used on strings that contain embedded nulls. Use
 *      TclUniCharMatch (the counted variant) where possible.
 *
 *      A "*" in the pattern is handled by recursion: the function calls
 *      itself for successive suffixes of the string until one matches or
 *      the string is exhausted.
 *
 * Results:
 *      The return value is 1 if string matches pattern, and 0 otherwise. The
 *      matching operation permits the following special characters in the
 *      pattern: *?\[] (see the manual entry for details on what these mean).
 *      An unmatched character set, and a pattern that ends right after a
 *      backslash, both yield 0.
 *
 * Side effects:
 *      None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharCaseMatch(
    CONST Tcl_UniChar *uniStr,  /* Unicode String. */
    CONST Tcl_UniChar *uniPattern,
                                /* Pattern, which may contain special
                                 * characters. */
    int nocase)                 /* 0 for case sensitive, non-zero for
                                 * insensitive */
{
    Tcl_UniChar ch1, p;

    while (1) {
        p = *uniPattern;

        /*
         * See if we're at the end of both the pattern and the string. If so,
         * we succeeded. If we're at the end of the pattern but not at the end
         * of the string, we failed. If the string is exhausted first, only a
         * "*" can still match (it may match the empty substring).
         */

        if (p == 0) {
            return (*uniStr == 0);
        }
        if ((*uniStr == 0) && (p != '*')) {
            return 0;
        }

        /*
         * Check for a "*" as the next pattern character. It matches any
         * substring. We handle this by skipping all the characters up to the
         * next matching one in the pattern, and then calling ourselves
         * recursively for each postfix of string, until either we match or we
         * reach the end of the string.
         */

        if (p == '*') {
            /*
             * Skip all successive *'s in the pattern
             */

            while (*(++uniPattern) == '*') {
                /* empty body */
            }
            p = *uniPattern;
            if (p == 0) {
                /*
                 * A trailing "*" matches whatever is left of the string.
                 */

                return 1;
            }
            if (nocase) {
                p = Tcl_UniCharToLower(p);
            }
            while (1) {
                /*
                 * Optimization for matching - cruise through the string
                 * quickly if the next char in the pattern isn't a special
                 * character. In the nocase branch p has already been
                 * lowercased, so it is compared against both the raw and the
                 * lowercased string character.
                 */

                if ((p != '[') && (p != '?') && (p != '\\')) {
                    if (nocase) {
                        while (*uniStr && (p != *uniStr)
                                && (p != Tcl_UniCharToLower(*uniStr))) {
                            uniStr++;
                        }
                    } else {
                        while (*uniStr && (p != *uniStr)) {
                            uniStr++;
                        }
                    }
                }

                /*
                 * Try to match the rest of the pattern at this position; on
                 * failure advance one character and try again, giving up
                 * once the string is exhausted.
                 */

                if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
                    return 1;
                }
                if (*uniStr == 0) {
                    return 0;
                }
                uniStr++;
            }
        }

        /*
         * Check for a "?" as the next pattern character. It matches any
         * single character. The string is known to be non-empty here: the
         * end-of-string test above already returned for every pattern
         * character other than "*".
         */

        if (p == '?') {
            uniPattern++;
            uniStr++;
            continue;
        }

        /*
         * Check for a "[" as the next pattern character. It is followed by a
         * list of characters that are acceptable, or by a range (two
         * characters separated by "-").
         */

        if (p == '[') {
            Tcl_UniChar startChar, endChar;

            uniPattern++;

            /*
             * Consume the string character to be tested against the set
             * (lowercased when matching case-insensitively).
             */

            ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
            uniStr++;
            while (1) {
                /*
                 * Reaching "]" or the end of the pattern means no member of
                 * the set matched.
                 */

                if ((*uniPattern == ']') || (*uniPattern == 0)) {
                    return 0;
                }
                startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
                        : *uniPattern);
                uniPattern++;
                if (*uniPattern == '-') {
                    uniPattern++;
                    if (*uniPattern == 0) {
                        return 0;
                    }
                    endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
                            : *uniPattern);
                    uniPattern++;
                    if (((startChar <= ch1) && (ch1 <= endChar))
                            || ((endChar <= ch1) && (ch1 <= startChar))) {
                        /*
                         * Matches ranges of form [a-z] or [z-a].
                         */
                        break;
                    }
                } else if (startChar == ch1) {
                    break;
                }
            }

            /*
             * A member matched: skip the remainder of the set up to and
             * including the closing "]". If the pattern ends without a "]",
             * the decrement below cancels the increment after the loop, so
             * uniPattern is left on the terminating zero.
             */

            while (*uniPattern != ']') {
                if (*uniPattern == 0) {
                    uniPattern--;
                    break;
                }
                uniPattern++;
            }
            uniPattern++;
            continue;
        }

        /*
         * If the next pattern character is '\', just strip off the '\' so we
         * do exact matching on the character that follows. A backslash at
         * the very end of the pattern cannot match anything.
         */

        if (p == '\\') {
            if (*(++uniPattern) == '\0') {
                return 0;
            }
        }

        /*
         * There's no special character. Just make sure that the next
         * characters of each string match.
         */

        if (nocase) {
            if (Tcl_UniCharToLower(*uniStr) !=
                    Tcl_UniCharToLower(*uniPattern)) {
                return 0;
            }
        } else if (*uniStr != *uniPattern) {
            return 0;
        }
        uniStr++;
        uniPattern++;
    }
}
01775 
01776 /*
01777  *----------------------------------------------------------------------
01778  *
01779  * TclUniCharMatch --
01780  *
01781  *      See if a particular Unicode string matches a particular pattern.
01782  *      Allows case insensitivity. This is the Unicode equivalent of the char*
01783  *      Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted
01784  *      Strings, so embedded NULLs are allowed.
01785  *
01786  * Results:
01787  *      The return value is 1 if string matches pattern, and 0 otherwise. The
01788  *      matching operation permits the following special characters in the
01789  *      pattern: *?\[] (see the manual entry for details on what these mean).
01790  *
01791  * Side effects:
01792  *      None.
01793  *
01794  *----------------------------------------------------------------------
01795  */
01796 
01797 int
01798 TclUniCharMatch(
01799     CONST Tcl_UniChar *string,  /* Unicode String. */
01800     int strLen,                 /* Length of String */
01801     CONST Tcl_UniChar *pattern, /* Pattern, which may contain special
01802                                  * characters. */
01803     int ptnLen,                 /* Length of Pattern */
01804     int nocase)                 /* 0 for case sensitive, 1 for insensitive */
01805 {
01806     CONST Tcl_UniChar *stringEnd, *patternEnd;
01807     Tcl_UniChar p;
01808 
01809     stringEnd = string + strLen;
01810     patternEnd = pattern + ptnLen;
01811 
01812     while (1) {
01813         /*
01814          * See if we're at the end of both the pattern and the string. If so,
01815          * we succeeded. If we're at the end of the pattern but not at the end
01816          * of the string, we failed.
01817          */
01818 
01819         if (pattern == patternEnd) {
01820             return (string == stringEnd);
01821         }
01822         p = *pattern;
01823         if ((string == stringEnd) && (p != '*')) {
01824             return 0;
01825         }
01826 
01827         /*
01828          * Check for a "*" as the next pattern character. It matches any
01829          * substring. We handle this by skipping all the characters up to the
01830          * next matching one in the pattern, and then calling ourselves
01831          * recursively for each postfix of string, until either we match or we
01832          * reach the end of the string.
01833          */
01834 
01835         if (p == '*') {
01836             /*
01837              * Skip all successive *'s in the pattern.
01838              */
01839 
01840             while (*(++pattern) == '*') {
01841                 /* empty body */
01842             }
01843             if (pattern == patternEnd) {
01844                 return 1;
01845             }
01846             p = *pattern;
01847             if (nocase) {
01848                 p = Tcl_UniCharToLower(p);
01849             }
01850             while (1) {
01851                 /*
01852                  * Optimization for matching - cruise through the string
01853                  * quickly if the next char in the pattern isn't a special
01854                  * character.
01855                  */
01856 
01857                 if ((p != '[') && (p != '?') && (p != '\\')) {
01858                     if (nocase) {
01859                         while ((string < stringEnd) && (p != *string)
01860                                 && (p != Tcl_UniCharToLower(*string))) {
01861                             string++;
01862                         }
01863                     } else {
01864                         while ((string < stringEnd) && (p != *string)) {
01865                             string++;
01866                         }
01867                     }
01868                 }
01869                 if (TclUniCharMatch(string, stringEnd - string,
01870                         pattern, patternEnd - pattern, nocase)) {
01871                     return 1;
01872                 }
01873                 if (string == stringEnd) {
01874                     return 0;
01875                 }
01876                 string++;
01877             }
01878         }
01879 
01880         /*
01881          * Check for a "?" as the next pattern character. It matches any
01882          * single character.
01883          */
01884 
01885         if (p == '?') {
01886             pattern++;
01887             string++;
01888             continue;
01889         }
01890 
01891         /*
01892          * Check for a "[" as the next pattern character. It is followed by a
01893          * list of characters that are acceptable, or by a range (two
01894          * characters separated by "-").
01895          */
01896 
01897         if (p == '[') {
01898             Tcl_UniChar ch1, startChar, endChar;
01899 
01900             pattern++;
01901             ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
01902             string++;
01903             while (1) {
01904                 if ((*pattern == ']') || (pattern == patternEnd)) {
01905                     return 0;
01906                 }
01907                 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
01908                 pattern++;
01909                 if (*pattern == '-') {
01910                     pattern++;
01911                     if (pattern == patternEnd) {
01912                         return 0;
01913                     }
01914                     endChar = (nocase ? Tcl_UniCharToLower(*pattern)
01915                             : *pattern);
01916                     pattern++;
01917                     if (((startChar <= ch1) && (ch1 <= endChar))
01918                             || ((endChar <= ch1) && (ch1 <= startChar))) {
01919                         /*
01920                          * Matches ranges of form [a-z] or [z-a].
01921                          */
01922                         break;
01923                     }
01924                 } else if (startChar == ch1) {
01925                     break;
01926                 }
01927             }
01928             while (*pattern != ']') {
01929                 if (pattern == patternEnd) {
01930                     pattern--;
01931                     break;
01932                 }
01933                 pattern++;
01934             }
01935             pattern++;
01936             continue;
01937         }
01938 
01939         /*
01940          * If the next pattern character is '\', just strip off the '\' so we
01941          * do exact matching on the character that follows.
01942          */
01943 
01944         if (p == '\\') {
01945             if (++pattern == patternEnd) {
01946                 return 0;
01947             }
01948         }
01949 
01950         /*
01951          * There's no special character. Just make sure that the next bytes of
01952          * each string match.
01953          */
01954 
01955         if (nocase) {
01956             if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
01957                 return 0;
01958             }
01959         } else if (*string != *pattern) {
01960             return 0;
01961         }
01962         string++;
01963         pattern++;
01964     }
01965 }
01966 
01967 /*
01968  * Local Variables:
01969  * mode: c
01970  * c-basic-offset: 4
01971  * fill-column: 78
01972  * End:
01973  */