regguts.h

Go to the documentation of this file.
00001 /*
00002  * Internal interface definitions, etc., for the reg package
00003  *
00004  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
00005  *
00006  * Development of this software was funded, in part, by Cray Research Inc.,
00007  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00008  * Corporation, none of whom are responsible for the results.  The author
00009  * thanks all of them.
00010  *
00011  * Redistribution and use in source and binary forms -- with or without
00012  * modification -- are permitted for any purpose, provided that
00013  * redistributions in source form retain this entire copyright notice and
00014  * indicate the origin and nature of any modifications.
00015  *
00016  * I'd appreciate being given credit for this package in the documentation of
00017  * software which uses it, but that is not a requirement.
00018  *
00019  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00020  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00021  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
00022  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00025  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00026  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00027  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00028  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  */
00030 
00031 /*
00032  * Environmental customization. It should not (I hope) be necessary to alter
00033  * the file you are now reading -- regcustom.h should handle it all, given
00034  * care here and elsewhere.
00035  */
00036 #include "regcustom.h"
00037 
00038 /*
00039  * Things that regcustom.h might override.
00040  */
00041 
00042 /* standard header files (NULL is a reasonable indicator for them) */
00043 #ifndef NULL
00044 #include <stdio.h>
00045 #include <stdlib.h>
00046 #include <ctype.h>
00047 #include <limits.h>
00048 #include <string.h>
00049 #endif
00050 
00051 /* assertions */
00052 #ifndef assert
00053 #ifndef REG_DEBUG
00054 #ifndef NDEBUG
00055 #define NDEBUG          /* no assertions */
00056 #endif
00057 #endif /* !REG_DEBUG */
00058 #include <assert.h>
00059 #endif
00060 
00061 /* voids */
00062 #ifndef VOID
00063 #define VOID    void            /* for function return values */
00064 #endif
00065 #ifndef DISCARD
00066 #define DISCARD void            /* for throwing values away */
00067 #endif
00068 #ifndef PVOID
00069 #define PVOID   void *          /* generic pointer */
00070 #endif
00071 #ifndef VS
00072 #define VS(x)   ((void*)(x))    /* cast something to generic ptr */
00073 #endif
00074 #ifndef NOPARMS
00075 #define NOPARMS void            /* for empty parm lists */
00076 #endif
00077 
00078 /* const */
00079 #ifndef CONST
00080 #define CONST   const           /* for old compilers, might be empty */
00081 #endif
00082 
00083 /* function-pointer declarator */
00084 #ifndef FUNCPTR
00085 #if __STDC__ >= 1
00086 #define FUNCPTR(name, args)     (*name)args
00087 #else
00088 #define FUNCPTR(name, args)     (*name)()
00089 #endif
00090 #endif
00091 
00092 /* memory allocation */
00093 #ifndef MALLOC
00094 #define MALLOC(n)       malloc(n)
00095 #endif
00096 #ifndef REALLOC
00097 #define REALLOC(p, n)   realloc(VS(p), n)
00098 #endif
00099 #ifndef FREE
00100 #define FREE(p)         free(VS(p))
00101 #endif
00102 
00103 /* want size of a char in bits, and max value in bounded quantifiers */
00104 #ifndef CHAR_BIT
00105 #include <limits.h>
00106 #endif
00107 #ifndef _POSIX2_RE_DUP_MAX
00108 #define _POSIX2_RE_DUP_MAX 255  /* normally from <limits.h> */
00109 #endif
00110 
00111 /*
00112  * misc
00113  */
00114 
00115 #define NOTREACHED      0
00116 #define xxx             1
00117 
00118 #define DUPMAX  _POSIX2_RE_DUP_MAX
00119 #define INFINITY        (DUPMAX+1)
00120 
00121 #define REMAGIC 0xfed7          /* magic number for main struct */
00122 
00123 /*
00124  * debugging facilities
00125  */
00126 #ifdef REG_DEBUG
00127 /* FDEBUG does finite-state tracing */
00128 #define FDEBUG(arglist) { if (v->eflags&REG_FTRACE) printf arglist; }
00129 /* MDEBUG does higher-level tracing */
00130 #define MDEBUG(arglist) { if (v->eflags&REG_MTRACE) printf arglist; }
00131 #else
00132 #define FDEBUG(arglist) {}
00133 #define MDEBUG(arglist) {}
00134 #endif
00135 
00136 /*
00137  * bitmap manipulation
00138  */
00139 #define UBITS   (CHAR_BIT * sizeof(unsigned))
00140 #define BSET(uv, sn)    ((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS))
00141 #define ISBSET(uv, sn)  ((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))
00142 
00143 /*
00144  * We dissect a chr into byts for colormap table indexing. Here we define a
00145  * byt, which will be the same as a byte on most machines... The exact size of
00146  * a byt is not critical, but about 8 bits is good, and extraction of 8-bit
00147  * chunks is sometimes especially fast.
00148  */
00149 
00150 #ifndef BYTBITS
00151 #define BYTBITS 8               /* bits in a byt */
00152 #endif
00153 #define BYTTAB  (1<<BYTBITS)    /* size of table with one entry per byt value */
00154 #define BYTMASK (BYTTAB-1)      /* bit mask for byt */
00155 #define NBYTS   ((CHRBITS+BYTBITS-1)/BYTBITS)
00156 /* the definition of GETCOLOR(), below, assumes NBYTS <= 4 */
00157 
00158 /*
00159  * As soon as possible, we map chrs into equivalence classes -- "colors" --
00160  * which are of much more manageable number.
00161  */
00162 
00163 typedef short color;            /* colors of characters */
00164 typedef int pcolor;             /* what color promotes to */
00165 #define COLORLESS       (-1)    /* impossible color */
00166 #define WHITE           0       /* default color, parent of all others */
00167 
00168 /*
00169  * A colormap is a tree -- more precisely, a DAG -- indexed at each level by a
00170  * byt of the chr, to map the chr to a color efficiently. Because lower
00171  * sections of the tree can be shared, it can exploit the usual sparseness of
00172  * such a mapping table. The tree is always NBYTS levels deep (in the past it
00173  * was shallower during construction but was "filled" to full depth at the end
00174  * of that); areas that are unaltered as yet point to "fill blocks" which are
00175  * entirely WHITE in color.
00176  */
00177 
00178 /* the tree itself */
00179 struct colors {
00180     color ccolor[BYTTAB];
00181 };
00182 struct ptrs {
00183     union tree *pptr[BYTTAB];
00184 };
00185 union tree {
00186     struct colors colors;
00187     struct ptrs ptrs;
00188 };
00189 #define tcolor  colors.ccolor
00190 #define tptr    ptrs.pptr
00191 
00192 /* Internal per-color descriptor structure for the color machinery */
00193 struct colordesc {
00194     uchr nchrs;                 /* number of chars of this color */
00195     color sub;                  /* open subcolor (if any); free chain ptr */
00196 #define NOSUB   COLORLESS
00197     struct arc *arcs;           /* color chain */
00198     int flags;
00199 #define FREECOL 01              /* currently free */
00200 #define PSEUDO  02              /* pseudocolor, no real chars */
00201 #define UNUSEDCOLOR(cd) ((cd)->flags&FREECOL)
00202     union tree *block;          /* block of solid color, if any */
00203 };
00204 
00205 /* the color map itself */
00206 struct colormap {
00207     int magic;
00208 #define CMMAGIC 0x876
00209     struct vars *v;             /* for compile error reporting */
00210     size_t ncds;                /* number of colordescs */
00211     size_t max;                 /* highest in use */
00212     color free;                 /* beginning of free chain (if non-0) */
00213     struct colordesc *cd;
00214 #define CDEND(cm)       (&(cm)->cd[(cm)->max + 1])
00215 #define NINLINECDS      ((size_t)10)
00216     struct colordesc cdspace[NINLINECDS];
00217     union tree tree[NBYTS];     /* tree top, plus fill blocks */
00218 };
00219 
00220 /* optimization magic to do fast chr->color mapping */
00221 #define B0(c)   ((c) & BYTMASK)
00222 #define B1(c)   (((c)>>BYTBITS) & BYTMASK)
00223 #define B2(c)   (((c)>>(2*BYTBITS)) & BYTMASK)
00224 #define B3(c)   (((c)>>(3*BYTBITS)) & BYTMASK)
00225 #if NBYTS == 1
00226 #define GETCOLOR(cm, c) ((cm)->tree->tcolor[B0(c)])
00227 #endif
00228 /* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly */
00229 #if NBYTS == 2
00230 #define GETCOLOR(cm, c) ((cm)->tree->tptr[B1(c)]->tcolor[B0(c)])
00231 #endif
00232 #if NBYTS == 4
00233 #define GETCOLOR(cm, c) ((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)])
00234 #endif
00235 
00236 /*
00237  * Interface definitions for locale-interface functions in locale.c.
00238  */
00239 
00240 /* Representation of a set of characters. */
00241 struct cvec {
00242     int nchrs;                  /* number of chrs */
00243     int chrspace;               /* number of chrs possible */
00244     chr *chrs;                  /* pointer to vector of chrs */
00245     int nranges;                /* number of ranges (chr pairs) */
00246     int rangespace;             /* number of chrs possible */
00247     chr *ranges;                /* pointer to vector of chr pairs */
00248 };
00249 
00250 /*
00251  * definitions for non-deterministic finite autmaton (NFA) internal
00252  * representation
00253  *
00254  * Having a "from" pointer within each arc may seem redundant, but it saves a
00255  * lot of hassle.
00256  */
00257 
00258 struct state;
00259 
00260 struct arc {
00261     int type;
00262 #define ARCFREE '\0'
00263     color co;
00264     struct state *from;         /* where it's from (and contained within) */
00265     struct state *to;           /* where it's to */
00266     struct arc *outchain;       /* *from's outs chain or free chain */
00267 #define freechain       outchain
00268     struct arc *inchain;        /* *to's ins chain */
00269     struct arc *colorchain;     /* color's arc chain */
00270     struct arc *colorchainRev;  /* back-link in color's arc chain */
00271 };
00272 
00273 struct arcbatch {               /* for bulk allocation of arcs */
00274     struct arcbatch *next;
00275 #define ABSIZE  10
00276     struct arc a[ABSIZE];
00277 };
00278 
00279 struct state {
00280     int no;
00281 #define FREESTATE       (-1)
00282     char flag;                  /* marks special states */
00283     int nins;                   /* number of inarcs */
00284     struct arc *ins;            /* chain of inarcs */
00285     int nouts;                  /* number of outarcs */
00286     struct arc *outs;           /* chain of outarcs */
00287     struct arc *free;           /* chain of free arcs */
00288     struct state *tmp;          /* temporary for traversal algorithms */
00289     struct state *next;         /* chain for traversing all */
00290     struct state *prev;         /* back chain */
00291     struct arcbatch oas;        /* first arcbatch, avoid malloc in easy case */
00292     int noas;                   /* number of arcs used in first arcbatch */
00293 };
00294 
00295 struct nfa {
00296     struct state *pre;          /* pre-initial state */
00297     struct state *init;         /* initial state */
00298     struct state *final;        /* final state */
00299     struct state *post;         /* post-final state */
00300     int nstates;                /* for numbering states */
00301     struct state *states;       /* state-chain header */
00302     struct state *slast;        /* tail of the chain */
00303     struct state *free;         /* free list */
00304     struct colormap *cm;        /* the color map */
00305     color bos[2];               /* colors, if any, assigned to BOS and BOL */
00306     color eos[2];               /* colors, if any, assigned to EOS and EOL */
00307     size_t size;                /* Current NFA size; differs from nstates as
00308                                  * it also counts the number of states created
00309                                  * by children of this state. */
00310     struct vars *v;             /* simplifies compile error reporting */
00311     struct nfa *parent;         /* parent NFA, if any */
00312 };
00313 
00314 /*
00315  * definitions for compacted NFA
00316  */
00317 
00318 struct carc {
00319     color co;                   /* COLORLESS is list terminator */
00320     int to;                     /* state number */
00321 };
00322 
00323 struct cnfa {
00324     int nstates;                /* number of states */
00325     int ncolors;                /* number of colors */
00326     int flags;
00327 #define HASLACONS       01      /* uses lookahead constraints */
00328     int pre;                    /* setup state number */
00329     int post;                   /* teardown state number */
00330     color bos[2];               /* colors, if any, assigned to BOS and BOL */
00331     color eos[2];               /* colors, if any, assigned to EOS and EOL */
00332     struct carc **states;       /* vector of pointers to outarc lists */
00333     struct carc *arcs;          /* the area for the lists */
00334 };
00335 #define ZAPCNFA(cnfa)   ((cnfa).nstates = 0)
00336 #define NULLCNFA(cnfa)  ((cnfa).nstates == 0)
00337 
00338 /*
00339  * Used to limit the maximum NFA size to something sane. [Bug 1810264]
00340  */
00341 
00342 #ifndef REG_MAX_STATES
00343 #   define REG_MAX_STATES       100000
00344 #endif
00345 
00346 /*
00347  * subexpression tree
00348  */
00349 
00350 struct subre {
00351     char op;                    /* '|', '.' (concat), 'b' (backref), '(',
00352                                  * '=' */
00353     char flags;
00354 #define LONGER  01              /* prefers longer match */
00355 #define SHORTER 02              /* prefers shorter match */
00356 #define MIXED   04              /* mixed preference below */
00357 #define CAP     010             /* capturing parens below */
00358 #define BACKR   020             /* back reference below */
00359 #define INUSE   0100            /* in use in final tree */
00360 #define LOCAL   03              /* bits which may not propagate up */
00361 #define LMIX(f) ((f)<<2)        /* LONGER -> MIXED */
00362 #define SMIX(f) ((f)<<1)        /* SHORTER -> MIXED */
00363 #define UP(f)   (((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED))
00364 #define MESSY(f)        ((f)&(MIXED|CAP|BACKR))
00365 #define PREF(f) ((f)&LOCAL)
00366 #define PREF2(f1, f2)   ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
00367 #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
00368     short retry;                /* index into retry memory */
00369     int subno;                  /* subexpression number (for 'b' and '(') */
00370     short min;                  /* min repetitions, for backref only */
00371     short max;                  /* max repetitions, for backref only */
00372     struct subre *left;         /* left child, if any (also freelist chain) */
00373     struct subre *right;        /* right child, if any */
00374     struct state *begin;        /* outarcs from here... */
00375     struct state *end;          /* ...ending in inarcs here */
00376     struct cnfa cnfa;           /* compacted NFA, if any */
00377     struct subre *chain;        /* for bookkeeping and error cleanup */
00378 };
00379 
00380 /*
00381  * table of function pointers for generic manipulation functions. A regex_t's
00382  * re_fns points to one of these.
00383  */
00384 
00385 struct fns {
00386     VOID FUNCPTR(free, (regex_t *));
00387 };
00388 
00389 /*
00390  * the insides of a regex_t, hidden behind a void *
00391  */
00392 
00393 struct guts {
00394     int magic;
00395 #define GUTSMAGIC       0xfed9
00396     int cflags;                 /* copy of compile flags */
00397     long info;                  /* copy of re_info */
00398     size_t nsub;                /* copy of re_nsub */
00399     struct subre *tree;
00400     struct cnfa search;         /* for fast preliminary search */
00401     int ntree;
00402     struct colormap cmap;
00403     int FUNCPTR(compare, (CONST chr *, CONST chr *, size_t));
00404     struct subre *lacons;       /* lookahead-constraint vector */
00405     int nlacons;                /* size of lacons */
00406 };
00407 
00408 /*
00409  * Magic for allocating a variable workspace. This default version is
00410  * stack-hungry.
00411  */
00412 
00413 #ifndef AllocVars
00414 #define AllocVars(vPtr) \
00415     struct vars var; \
00416     register struct vars *vPtr = &var
00417 #endif
00418 #ifndef FreeVars
00419 #define FreeVars(vPtr) ((void) 0)
00420 #endif
00421 
00422 /*
00423  * Local Variables:
00424  * mode: c
00425  * c-basic-offset: 4
00426  * fill-column: 78
00427  * End:
00428  */



Generated on Wed Mar 12 12:18:10 2008 by  doxygen 1.5.1