C:/Users/Dennis/src/lang/Life_start/Life/life-1.02/source/regexp/regexp.c

Go to the documentation of this file.
00001 /*
00002  * regcomp and regexec -- regsub and regerror are elsewhere
00003  * @(#)regexp.c 1.3 of 18 April 87
00004  *
00005  *      Copyright (c) 1986 by University of Toronto.
00006  *      Written by Henry Spencer.  Not derived from licensed software.
00007  *
00008  *      Permission is granted to anyone to use this software for any
00009  *      purpose on any computer system, and to redistribute it freely,
00010  *      subject to the following restrictions:
00011  *
00012  *      1. The author is not responsible for the consequences of use of
00013  *              this software, no matter how awful, even if they arise
00014  *              from defects in it.
00015  *
00016  *      2. The origin of this software must not be misrepresented, either
00017  *              by explicit claim or by omission.
00018  *
00019  *      3. Altered versions must be plainly marked as such, and must not
00020  *              be misrepresented as being the original software.
00021  *
00022  * Beware that some of this code is subtly aware of the way operator
00023  * precedence is structured in regular expressions.  Serious changes in
00024  * regular-expression syntax might require a total rethink.
00025  */
00026 #include <stdio.h>
00027 #include <string.h>
00028 #include "regexp.h"
00029 #include "regmagic.h"
00030 
00031 /*
00032  * The "internal use only" fields in regexp.h are present to pass info from
00033  * compile to execute that permits the execute phase to run lots faster on
00034  * simple cases.  They are:
00035  *
00036  * regstart     char that must begin a match; '\0' if none obvious
00037  * reganch      is the match anchored (at beginning-of-line only)?
00038  * regmust      string (pointer into program) that match must include, or NULL
00039  * regmlen      length of regmust string
00040  *
00041  * Regstart and reganch permit very fast decisions on suitable starting points
00042  * for a match, cutting down the work a lot.  Regmust permits fast rejection
00043  * of lines that cannot possibly match.  The regmust tests are costly enough
00044  * that regcomp() supplies a regmust only if the r.e. contains something
00045  * potentially expensive (at present, the only such thing detected is * or +
00046  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
00047  * supplied because the test in regexec() needs it and regcomp() is computing
00048  * it anyway.
00049  */
00050 
00051 /*
00052  * Structure for regexp "program".  This is essentially a linear encoding
00053  * of a nondeterministic finite-state machine (aka syntax charts or
00054  * "railroad normal form" in parsing technology).  Each node is an opcode
00055  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
00056  * all nodes except BRANCH implement concatenation; a "next" pointer with
00057  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
00058  * have one of the subtle syntax dependencies:  an individual BRANCH (as
00059  * opposed to a collection of them) is never concatenated with anything
00060  * because of operator precedence.)  The operand of some types of node is
00061  * a literal string; for others, it is a node leading into a sub-FSM.  In
00062  * particular, the operand of a BRANCH node is the first node of the branch.
00063  * (NB this is *not* a tree structure:  the tail of the branch connects
00064  * to the thing following the set of BRANCHes.)  The opcodes are:
00065  */
00066 
00067 /* definition   number  opnd?   meaning */
00068 #define END     0       /* no   End of program. */
00069 #define BOL     1       /* no   Match "" at beginning of line. */
00070 #define EOL     2       /* no   Match "" at end of line. */
00071 #define ANY     3       /* no   Match any one character. */
00072 #define ANYOF   4       /* str  Match any character in this string. */
00073 #define ANYBUT  5       /* str  Match any character not in this string. */
00074 #define BRANCH  6       /* node Match this alternative, or the next... */
00075 #define BACK    7       /* no   Match "", "next" ptr points backward. */
00076 #define EXACTLY 8       /* str  Match this string. */
00077 #define NOTHING 9       /* no   Match empty string. */
00078 #define STAR    10      /* node Match this (simple) thing 0 or more times. */
00079 #define PLUS    11      /* node Match this (simple) thing 1 or more times. */
00080 #define OPEN    20      /* no   Mark this point in input as start of #n. */
00081                         /*      OPEN+1 is number 1, etc. */
00082 #define CLOSE   30      /* no   Analogous to OPEN. */
00083 
00084 /*
00085  * Opcode notes:
00086  *
00087  * BRANCH       The set of branches constituting a single choice are hooked
00088  *              together with their "next" pointers, since precedence prevents
00089  *              anything being concatenated to any individual branch.  The
00090  *              "next" pointer of the last BRANCH in a choice points to the
00091  *              thing following the whole choice.  This is also where the
00092  *              final "next" pointer of each individual branch points; each
00093  *              branch starts with the operand node of a BRANCH node.
00094  *
00095  * BACK         Normal "next" pointers all implicitly point forward; BACK
00096  *              exists to make loop structures possible.
00097  *
00098  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
00099  *              BRANCH structures using BACK.  Simple cases (one character
00100  *              per match) are implemented with STAR and PLUS for speed
00101  *              and to minimize recursive plunges.
00102  *
00103  * OPEN,CLOSE   ...are numbered at compile time.
00104  */
00105 
00106 /*
00107  * A node is one char of opcode followed by two chars of "next" pointer.
00108  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
00109  * value is a positive offset from the opcode of the node containing it.
00110  * An operand, if any, simply follows the node.  (Note that much of the
00111  * code generation knows about this implicit relationship.)
00112  *
00113  * Using two bytes for the "next" pointer is vast overkill for most things,
00114  * but allows patterns to get big without disasters.
00115  */
00116 #define OP(p)   (*(p))
00117 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
00118 #define OPERAND(p)      ((p) + 3)
00119 
00120 /*
00121  * See regmagic.h for one further detail of program structure.
00122  */
00123 
00124 
00125 /*
00126  * Utility definitions.
00127  */
00128 #ifndef CHARBITS
00129 #define UCHARAT(p)      ((int)*(unsigned char *)(p))
00130 #else
00131 #define UCHARAT(p)      ((int)*(p)&CHARBITS)
00132 #endif
00133 
00134 #define FAIL(m) { regerror(m); return(NULL); }
00135 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
00136 #define META    "^$.[()|?+*\\"
00137 
00138 /*
00139  * Flags to be passed up and down.
00140  */
00141 #define HASWIDTH        01      /* Known never to match null string. */
00142 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
00143 #define SPSTART         04      /* Starts with * or +. */
00144 #define WORST           0       /* Worst case. */
00145 
00146 /*
00147  * Global work variables for regcomp().
00148  */
00149 static char *regparse;          /* Input-scan pointer. */
00150 static int regnpar;             /* () count. */
00151 static char regdummy;
00152 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
00153 static long regsize;            /* Code size. */
00154 
00155 /*
00156  * Forward declarations for regcomp()'s friends.
00157  */
00158 #ifndef STATIC
00159 #define STATIC  static
00160 #endif
00161 STATIC char *reg();
00162 STATIC char *regbranch();
00163 STATIC char *regpiece();
00164 STATIC char *regatom();
00165 STATIC char *regnode();
00166 STATIC char *regnext();
00167 STATIC void regc();
00168 STATIC void reginsert();
00169 STATIC void regtail();
00170 STATIC void regoptail();
00171 #ifdef STRCSPN
00172 STATIC int strcspn();
00173 #endif
00174 
00175 /*
00176  - regcomp - compile a regular expression into internal code
00177  *
00178  * We can't allocate space until we know how big the compiled form will be,
00179  * but we can't compile it (and thus know how big it is) until we've got a
00180  * place to put the code.  So we cheat:  we compile it twice, once with code
00181  * generation turned off and size counting turned on, and once "for real".
00182  * This also means that we don't allocate space until we are sure that the
00183  * thing really will compile successfully, and we never have to move the
00184  * code and thus invalidate pointers into it.  (Note that it has to be in
00185  * one piece because free() must be able to free it all.)
00186  *
00187  * Beware that the optimization-preparation code in here knows about some
00188  * of the structure of the compiled regexp.
00189  */
00190 regexp *
00191 regcomp(exp)
00192 char *exp;
00193 {
00194         register regexp *r;
00195         register char *scan;
00196         register char *longest;
00197         register int len;
00198         int flags;
00199         extern char *malloc();
00200 
00201         if (exp == NULL)
00202                 FAIL("NULL argument");
00203 
00204         /* First pass: determine size, legality. */
00205         regparse = exp;
00206         regnpar = 1;
00207         regsize = 0L;
00208         regcode = &regdummy;
00209         regc(MAGIC);
00210         if (reg(0, &flags) == NULL)
00211                 return(NULL);
00212 
00213         /* Small enough for pointer-storage convention? */
00214         if (regsize >= 32767L)          /* Probably could be 65535L. */
00215                 FAIL("regexp too big");
00216 
00217         /* Allocate space. */
00218         r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
00219         if (r == NULL)
00220                 FAIL("out of space");
00221 
00222         /* Second pass: emit code. */
00223         regparse = exp;
00224         regnpar = 1;
00225         regcode = r->program;
00226         regc(MAGIC);
00227         if (reg(0, &flags) == NULL)
00228                 return(NULL);
00229 
00230         /* Dig out information for optimizations. */
00231         r->regstart = '\0';     /* Worst-case defaults. */
00232         r->reganch = 0;
00233         r->regmust = NULL;
00234         r->regmlen = 0;
00235         scan = r->program+1;                    /* First BRANCH. */
00236         if (OP(regnext(scan)) == END) {         /* Only one top-level choice. */
00237                 scan = OPERAND(scan);
00238 
00239                 /* Starting-point info. */
00240                 if (OP(scan) == EXACTLY)
00241                         r->regstart = *OPERAND(scan);
00242                 else if (OP(scan) == BOL)
00243                         r->reganch++;
00244 
00245                 /*
00246                  * If there's something expensive in the r.e., find the
00247                  * longest literal string that must appear and make it the
00248                  * regmust.  Resolve ties in favor of later strings, since
00249                  * the regstart check works with the beginning of the r.e.
00250                  * and avoiding duplication strengthens checking.  Not a
00251                  * strong reason, but sufficient in the absence of others.
00252                  */
00253                 if (flags&SPSTART) {
00254                         longest = NULL;
00255                         len = 0;
00256                         for (; scan != NULL; scan = regnext(scan))
00257                                 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
00258                                         longest = OPERAND(scan);
00259                                         len = strlen(OPERAND(scan));
00260                                 }
00261                         r->regmust = longest;
00262                         r->regmlen = len;
00263                 }
00264         }
00265 
00266         return(r);
00267 }
00268 
00269 /*
00270  - reg - regular expression, i.e. main body or parenthesized thing
00271  *
00272  * Caller must absorb opening parenthesis.
00273  *
00274  * Combining parenthesis handling with the base level of regular expression
00275  * is a trifle forced, but the need to tie the tails of the branches to what
00276  * follows makes it hard to avoid.
00277  */
00278 static char *
00279 reg(paren, flagp)
00280 int paren;                      /* Parenthesized? */
00281 int *flagp;
00282 {
00283         register char *ret;
00284         register char *br;
00285         register char *ender;
00286         register int parno;
00287         int flags;
00288 
00289         *flagp = HASWIDTH;      /* Tentatively. */
00290 
00291         /* Make an OPEN node, if parenthesized. */
00292         if (paren) {
00293                 if (regnpar >= NSUBEXP)
00294                         FAIL("too many ()");
00295                 parno = regnpar;
00296                 regnpar++;
00297                 ret = regnode(OPEN+parno);
00298         } else
00299                 ret = NULL;
00300 
00301         /* Pick up the branches, linking them together. */
00302         br = regbranch(&flags);
00303         if (br == NULL)
00304                 return(NULL);
00305         if (ret != NULL)
00306                 regtail(ret, br);       /* OPEN -> first. */
00307         else
00308                 ret = br;
00309         if (!(flags&HASWIDTH))
00310                 *flagp &= ~HASWIDTH;
00311         *flagp |= flags&SPSTART;
00312         while (*regparse == '|') {
00313                 regparse++;
00314                 br = regbranch(&flags);
00315                 if (br == NULL)
00316                         return(NULL);
00317                 regtail(ret, br);       /* BRANCH -> BRANCH. */
00318                 if (!(flags&HASWIDTH))
00319                         *flagp &= ~HASWIDTH;
00320                 *flagp |= flags&SPSTART;
00321         }
00322 
00323         /* Make a closing node, and hook it on the end. */
00324         ender = regnode((paren) ? CLOSE+parno : END);   
00325         regtail(ret, ender);
00326 
00327         /* Hook the tails of the branches to the closing node. */
00328         for (br = ret; br != NULL; br = regnext(br))
00329                 regoptail(br, ender);
00330 
00331         /* Check for proper termination. */
00332         if (paren && *regparse++ != ')') {
00333                 FAIL("unmatched ()");
00334         } else if (!paren && *regparse != '\0') {
00335                 if (*regparse == ')') {
00336                         FAIL("unmatched ()");
00337                 } else
00338                         FAIL("junk on end");    /* "Can't happen". */
00339                 /* NOTREACHED */
00340         }
00341 
00342         return(ret);
00343 }
00344 
00345 /*
00346  - regbranch - one alternative of an | operator
00347  *
00348  * Implements the concatenation operator.
00349  */
00350 static char *
00351 regbranch(flagp)
00352 int *flagp;
00353 {
00354         register char *ret;
00355         register char *chain;
00356         register char *latest;
00357         int flags;
00358 
00359         *flagp = WORST;         /* Tentatively. */
00360 
00361         ret = regnode(BRANCH);
00362         chain = NULL;
00363         while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
00364                 latest = regpiece(&flags);
00365                 if (latest == NULL)
00366                         return(NULL);
00367                 *flagp |= flags&HASWIDTH;
00368                 if (chain == NULL)      /* First piece. */
00369                         *flagp |= flags&SPSTART;
00370                 else
00371                         regtail(chain, latest);
00372                 chain = latest;
00373         }
00374         if (chain == NULL)      /* Loop ran zero times. */
00375                 (void) regnode(NOTHING);
00376 
00377         return(ret);
00378 }
00379 
00380 /*
00381  - regpiece - something followed by possible [*+?]
00382  *
00383  * Note that the branching code sequences used for ? and the general cases
00384  * of * and + are somewhat optimized:  they use the same NOTHING node as
00385  * both the endmarker for their branch list and the body of the last branch.
00386  * It might seem that this node could be dispensed with entirely, but the
00387  * endmarker role is not redundant.
00388  */
00389 static char *
00390 regpiece(flagp)
00391 int *flagp;
00392 {
00393         register char *ret;
00394         register char op;
00395         register char *next;
00396         int flags;
00397 
00398         ret = regatom(&flags);
00399         if (ret == NULL)
00400                 return(NULL);
00401 
00402         op = *regparse;
00403         if (!ISMULT(op)) {
00404                 *flagp = flags;
00405                 return(ret);
00406         }
00407 
00408         if (!(flags&HASWIDTH) && op != '?')
00409                 FAIL("*+ operand could be empty");
00410         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
00411 
00412         if (op == '*' && (flags&SIMPLE))
00413                 reginsert(STAR, ret);
00414         else if (op == '*') {
00415                 /* Emit x* as (x&|), where & means "self". */
00416                 reginsert(BRANCH, ret);                 /* Either x */
00417                 regoptail(ret, regnode(BACK));          /* and loop */
00418                 regoptail(ret, ret);                    /* back */
00419                 regtail(ret, regnode(BRANCH));          /* or */
00420                 regtail(ret, regnode(NOTHING));         /* null. */
00421         } else if (op == '+' && (flags&SIMPLE))
00422                 reginsert(PLUS, ret);
00423         else if (op == '+') {
00424                 /* Emit x+ as x(&|), where & means "self". */
00425                 next = regnode(BRANCH);                 /* Either */
00426                 regtail(ret, next);
00427                 regtail(regnode(BACK), ret);            /* loop back */
00428                 regtail(next, regnode(BRANCH));         /* or */
00429                 regtail(ret, regnode(NOTHING));         /* null. */
00430         } else if (op == '?') {
00431                 /* Emit x? as (x|) */
00432                 reginsert(BRANCH, ret);                 /* Either x */
00433                 regtail(ret, regnode(BRANCH));          /* or */
00434                 next = regnode(NOTHING);                /* null. */
00435                 regtail(ret, next);
00436                 regoptail(ret, next);
00437         }
00438         regparse++;
00439         if (ISMULT(*regparse))
00440                 FAIL("nested *?+");
00441 
00442         return(ret);
00443 }
00444 
00445 /*
00446  - regatom - the lowest level
00447  *
00448  * Optimization:  gobbles an entire sequence of ordinary characters so that
00449  * it can turn them into a single node, which is smaller to store and
00450  * faster to run.  Backslashed characters are exceptions, each becoming a
00451  * separate node; the code is simpler that way and it's not worth fixing.
00452  */
00453 static char *
00454 regatom(flagp)
00455 int *flagp;
00456 {
00457         register char *ret;
00458         int flags;
00459 
00460         *flagp = WORST;         /* Tentatively. */
00461 
00462         switch (*regparse++) {
00463         case '^':
00464                 ret = regnode(BOL);
00465                 break;
00466         case '$':
00467                 ret = regnode(EOL);
00468                 break;
00469         case '.':
00470                 ret = regnode(ANY);
00471                 *flagp |= HASWIDTH|SIMPLE;
00472                 break;
00473         case '[': {
00474                         register int class;
00475                         register int classend;
00476 
00477                         if (*regparse == '^') { /* Complement of range. */
00478                                 ret = regnode(ANYBUT);
00479                                 regparse++;
00480                         } else
00481                                 ret = regnode(ANYOF);
00482                         if (*regparse == ']' || *regparse == '-')
00483                                 regc(*regparse++);
00484                         while (*regparse != '\0' && *regparse != ']') {
00485                                 if (*regparse == '-') {
00486                                         regparse++;
00487                                         if (*regparse == ']' || *regparse == '\0')
00488                                                 regc('-');
00489                                         else {
00490                                                 class = UCHARAT(regparse-2)+1;
00491                                                 classend = UCHARAT(regparse);
00492                                                 if (class > classend+1)
00493                                                         FAIL("invalid [] range");
00494                                                 for (; class <= classend; class++)
00495                                                         regc(class);
00496                                                 regparse++;
00497                                         }
00498                                 } else
00499                                         regc(*regparse++);
00500                         }
00501                         regc('\0');
00502                         if (*regparse != ']')
00503                                 FAIL("unmatched []");
00504                         regparse++;
00505                         *flagp |= HASWIDTH|SIMPLE;
00506                 }
00507                 break;
00508         case '(':
00509                 ret = reg(1, &flags);
00510                 if (ret == NULL)
00511                         return(NULL);
00512                 *flagp |= flags&(HASWIDTH|SPSTART);
00513                 break;
00514         case '\0':
00515         case '|':
00516         case ')':
00517                 FAIL("internal urp");   /* Supposed to be caught earlier. */
00518                 break;
00519         case '?':
00520         case '+':
00521         case '*':
00522                 FAIL("?+* follows nothing");
00523                 break;
00524         case '\\':
00525                 if (*regparse == '\0')
00526                         FAIL("trailing \\");
00527                 ret = regnode(EXACTLY);
00528                 regc(*regparse++);
00529                 regc('\0');
00530                 *flagp |= HASWIDTH|SIMPLE;
00531                 break;
00532         default: {
00533                         register int len;
00534                         register char ender;
00535 
00536                         regparse--;
00537                         len = strcspn(regparse, META);
00538                         if (len <= 0)
00539                                 FAIL("internal disaster");
00540                         ender = *(regparse+len);
00541                         if (len > 1 && ISMULT(ender))
00542                                 len--;          /* Back off clear of ?+* operand. */
00543                         *flagp |= HASWIDTH;
00544                         if (len == 1)
00545                                 *flagp |= SIMPLE;
00546                         ret = regnode(EXACTLY);
00547                         while (len > 0) {
00548                                 regc(*regparse++);
00549                                 len--;
00550                         }
00551                         regc('\0');
00552                 }
00553                 break;
00554         }
00555 
00556         return(ret);
00557 }
00558 
00559 /*
00560  - regnode - emit a node
00561  */
00562 static char *                   /* Location. */
00563 regnode(op)
00564 char op;
00565 {
00566         register char *ret;
00567         register char *ptr;
00568 
00569         ret = regcode;
00570         if (ret == &regdummy) {
00571                 regsize += 3;
00572                 return(ret);
00573         }
00574 
00575         ptr = ret;
00576         *ptr++ = op;
00577         *ptr++ = '\0';          /* Null "next" pointer. */
00578         *ptr++ = '\0';
00579         regcode = ptr;
00580 
00581         return(ret);
00582 }
00583 
00584 /*
00585  - regc - emit (if appropriate) a byte of code
00586  */
00587 static void
00588 regc(b)
00589 char b;
00590 {
00591         if (regcode != &regdummy)
00592                 *regcode++ = b;
00593         else
00594                 regsize++;
00595 }
00596 
00597 /*
00598  - reginsert - insert an operator in front of already-emitted operand
00599  *
00600  * Means relocating the operand.
00601  */
00602 static void
00603 reginsert(op, opnd)
00604 char op;
00605 char *opnd;
00606 {
00607         register char *src;
00608         register char *dst;
00609         register char *place;
00610 
00611         if (regcode == &regdummy) {
00612                 regsize += 3;
00613                 return;
00614         }
00615 
00616         src = regcode;
00617         regcode += 3;
00618         dst = regcode;
00619         while (src > opnd)
00620                 *--dst = *--src;
00621 
00622         place = opnd;           /* Op node, where operand used to be. */
00623         *place++ = op;
00624         *place++ = '\0';
00625         *place++ = '\0';
00626 }
00627 
00628 /*
00629  - regtail - set the next-pointer at the end of a node chain
00630  */
00631 static void
00632 regtail(p, val)
00633 char *p;
00634 char *val;
00635 {
00636         register char *scan;
00637         register char *temp;
00638         register int offset;
00639 
00640         if (p == &regdummy)
00641                 return;
00642 
00643         /* Find last node. */
00644         scan = p;
00645         for (;;) {
00646                 temp = regnext(scan);
00647                 if (temp == NULL)
00648                         break;
00649                 scan = temp;
00650         }
00651 
00652         if (OP(scan) == BACK)
00653                 offset = scan - val;
00654         else
00655                 offset = val - scan;
00656         *(scan+1) = (offset>>8)&0377;
00657         *(scan+2) = offset&0377;
00658 }
00659 
00660 /*
00661  - regoptail - regtail on operand of first argument; nop if operandless
00662  */
00663 static void
00664 regoptail(p, val)
00665 char *p;
00666 char *val;
00667 {
00668         /* "Operandless" and "op != BRANCH" are synonymous in practice. */
00669         if (p == NULL || p == &regdummy || OP(p) != BRANCH)
00670                 return;
00671         regtail(OPERAND(p), val);
00672 }
00673 
00674 /*
00675  * regexec and friends
00676  */
00677 
00678 /*
00679  * Global work variables for regexec().
00680  */
00681 static char *reginput;          /* String-input pointer. */
00682 static char *regbol;            /* Beginning of input, for ^ check. */
00683 static char **regstartp;        /* Pointer to startp array. */
00684 static char **regendp;          /* Ditto for endp. */
00685 
00686 /*
00687  * Forwards.
00688  */
00689 STATIC int regtry();
00690 STATIC int regmatch();
00691 STATIC int regrepeat();
00692 
00693 #ifdef DEBUG
00694 int regnarrate = 0;
00695 void regdump();
00696 STATIC char *regprop();
00697 #endif
00698 
00699 /*
00700  - regexec - match a regexp against a string
00701  */
00702 int
00703 regexec(prog, string)
00704 register regexp *prog;
00705 register char *string;
00706 {
00707         register char *s;
00708         extern char *strchr();
00709 
00710         /* Be paranoid... */
00711         if (prog == NULL || string == NULL) {
00712                 regerror("NULL parameter");
00713                 return(0);
00714         }
00715 
00716         /* Check validity of program. */
00717         if (UCHARAT(prog->program) != MAGIC) {
00718                 regerror("corrupted program");
00719                 return(0);
00720         }
00721 
00722         /* If there is a "must appear" string, look for it. */
00723         if (prog->regmust != NULL) {
00724                 s = string;
00725                 while ((s = strchr(s, prog->regmust[0])) != NULL) {
00726                         if (strncmp(s, prog->regmust, prog->regmlen) == 0)
00727                                 break;  /* Found it. */
00728                         s++;
00729                 }
00730                 if (s == NULL)  /* Not present. */
00731                         return(0);
00732         }
00733 
00734         /* Mark beginning of line for ^ . */
00735         regbol = string;
00736 
00737         /* Simplest case:  anchored match need be tried only once. */
00738         if (prog->reganch)
00739                 return(regtry(prog, string));
00740 
00741         /* Messy cases:  unanchored match. */
00742         s = string;
00743         if (prog->regstart != '\0')
00744                 /* We know what char it must start with. */
00745                 while ((s = strchr(s, prog->regstart)) != NULL) {
00746                         if (regtry(prog, s))
00747                                 return(1);
00748                         s++;
00749                 }
00750         else
00751                 /* We don't -- general case. */
00752                 do {
00753                         if (regtry(prog, s))
00754                                 return(1);
00755                 } while (*s++ != '\0');
00756 
00757         /* Failure. */
00758         return(0);
00759 }
00760 
00761 /*
00762  - regtry - try match at specific point
00763  */
00764 static int                      /* 0 failure, 1 success */
00765 regtry(prog, string)
00766 regexp *prog;
00767 char *string;
00768 {
00769         register int i;
00770         register char **sp;
00771         register char **ep;
00772 
00773         reginput = string;
00774         regstartp = prog->startp;
00775         regendp = prog->endp;
00776 
00777         sp = prog->startp;
00778         ep = prog->endp;
00779         for (i = NSUBEXP; i > 0; i--) {
00780                 *sp++ = NULL;
00781                 *ep++ = NULL;
00782         }
00783         if (regmatch(prog->program + 1)) {
00784                 prog->startp[0] = string;
00785                 prog->endp[0] = reginput;
00786                 return(1);
00787         } else
00788                 return(0);
00789 }
00790 
00791 /*
00792  - regmatch - main matching routine
00793  *
00794  * Conceptually the strategy is simple:  check to see whether the current
00795  * node matches, call self recursively to see whether the rest matches,
00796  * and then act accordingly.  In practice we make some effort to avoid
00797  * recursion, in particular by going through "ordinary" nodes (that don't
00798  * need to know whether the rest of the match failed) by a loop instead of
00799  * by recursion.
00800  */
00801 static int                      /* 0 failure, 1 success */
00802 regmatch(prog)
00803 char *prog;
00804 {
00805         register char *scan;    /* Current node. */
00806         char *next;             /* Next node. */
00807         extern char *strchr();
00808 
00809         scan = prog;
00810 #ifdef DEBUG
00811         if (scan != NULL && regnarrate)
00812                 fprintf(stderr, "%s(\n", regprop(scan));
00813 #endif
00814         while (scan != NULL) {
00815 #ifdef DEBUG
00816                 if (regnarrate)
00817                         fprintf(stderr, "%s...\n", regprop(scan));
00818 #endif
00819                 next = regnext(scan);
00820 
00821                 switch (OP(scan)) {
00822                 case BOL:
00823                         if (reginput != regbol)
00824                                 return(0);
00825                         break;
00826                 case EOL:
00827                         if (*reginput != '\0')
00828                                 return(0);
00829                         break;
00830                 case ANY:
00831                         if (*reginput == '\0')
00832                                 return(0);
00833                         reginput++;
00834                         break;
00835                 case EXACTLY: {
00836                                 register int len;
00837                                 register char *opnd;
00838 
00839                                 opnd = OPERAND(scan);
00840                                 /* Inline the first character, for speed. */
00841                                 if (*opnd != *reginput)
00842                                         return(0);
00843                                 len = strlen(opnd);
00844                                 if (len > 1 && strncmp(opnd, reginput, len) != 0)
00845                                         return(0);
00846                                 reginput += len;
00847                         }
00848                         break;
00849                 case ANYOF:
00850                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
00851                                 return(0);
00852                         reginput++;
00853                         break;
00854                 case ANYBUT:
00855                         if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
00856                                 return(0);
00857                         reginput++;
00858                         break;
00859                 case NOTHING:
00860                         break;
00861                 case BACK:
00862                         break;
00863                 case OPEN+1:
00864                 case OPEN+2:
00865                 case OPEN+3:
00866                 case OPEN+4:
00867                 case OPEN+5:
00868                 case OPEN+6:
00869                 case OPEN+7:
00870                 case OPEN+8:
00871                 case OPEN+9: {
00872                                 register int no;
00873                                 register char *save;
00874 
00875                                 no = OP(scan) - OPEN;
00876                                 save = reginput;
00877 
00878                                 if (regmatch(next)) {
00879                                         /*
00880                                          * Don't set startp if some later
00881                                          * invocation of the same parentheses
00882                                          * already has.
00883                                          */
00884                                         if (regstartp[no] == NULL)
00885                                                 regstartp[no] = save;
00886                                         return(1);
00887                                 } else
00888                                         return(0);
00889                         }
00890                         break;
00891                 case CLOSE+1:
00892                 case CLOSE+2:
00893                 case CLOSE+3:
00894                 case CLOSE+4:
00895                 case CLOSE+5:
00896                 case CLOSE+6:
00897                 case CLOSE+7:
00898                 case CLOSE+8:
00899                 case CLOSE+9: {
00900                                 register int no;
00901                                 register char *save;
00902 
00903                                 no = OP(scan) - CLOSE;
00904                                 save = reginput;
00905 
00906                                 if (regmatch(next)) {
00907                                         /*
00908                                          * Don't set endp if some later
00909                                          * invocation of the same parentheses
00910                                          * already has.
00911                                          */
00912                                         if (regendp[no] == NULL)
00913                                                 regendp[no] = save;
00914                                         return(1);
00915                                 } else
00916                                         return(0);
00917                         }
00918                         break;
00919                 case BRANCH: {
00920                                 register char *save;
00921 
00922                                 if (OP(next) != BRANCH)         /* No choice. */
00923                                         next = OPERAND(scan);   /* Avoid recursion. */
00924                                 else {
00925                                         do {
00926                                                 save = reginput;
00927                                                 if (regmatch(OPERAND(scan)))
00928                                                         return(1);
00929                                                 reginput = save;
00930                                                 scan = regnext(scan);
00931                                         } while (scan != NULL && OP(scan) == BRANCH);
00932                                         return(0);
00933                                         /* NOTREACHED */
00934                                 }
00935                         }
00936                         break;
00937                 case STAR:
00938                 case PLUS: {
00939                                 register char nextch;
00940                                 register int no;
00941                                 register char *save;
00942                                 register int min;
00943 
00944                                 /*
00945                                  * Lookahead to avoid useless match attempts
00946                                  * when we know what character comes next.
00947                                  */
00948                                 nextch = '\0';
00949                                 if (OP(next) == EXACTLY)
00950                                         nextch = *OPERAND(next);
00951                                 min = (OP(scan) == STAR) ? 0 : 1;
00952                                 save = reginput;
00953                                 no = regrepeat(OPERAND(scan));
00954                                 while (no >= min) {
00955                                         /* If it could work, try it. */
00956                                         if (nextch == '\0' || *reginput == nextch)
00957                                                 if (regmatch(next))
00958                                                         return(1);
00959                                         /* Couldn't or didn't -- back up. */
00960                                         no--;
00961                                         reginput = save + no;
00962                                 }
00963                                 return(0);
00964                         }
00965                         break;
00966                 case END:
00967                         return(1);      /* Success! */
00968                         break;
00969                 default:
00970                         regerror("memory corruption");
00971                         return(0);
00972                         break;
00973                 }
00974 
00975                 scan = next;
00976         }
00977 
00978         /*
00979          * We get here only if there's trouble -- normally "case END" is
00980          * the terminating point.
00981          */
00982         regerror("corrupted pointers");
00983         return(0);
00984 }
00985 
00986 /*
00987  - regrepeat - repeatedly match something simple, report how many
00988  */
00989 static int
00990 regrepeat(p)
00991 char *p;
00992 {
00993         register int count = 0;
00994         register char *scan;
00995         register char *opnd;
00996 
00997         scan = reginput;
00998         opnd = OPERAND(p);
00999         switch (OP(p)) {
01000         case ANY:
01001                 count = strlen(scan);
01002                 scan += count;
01003                 break;
01004         case EXACTLY:
01005                 while (*opnd == *scan) {
01006                         count++;
01007                         scan++;
01008                 }
01009                 break;
01010         case ANYOF:
01011                 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
01012                         count++;
01013                         scan++;
01014                 }
01015                 break;
01016         case ANYBUT:
01017                 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
01018                         count++;
01019                         scan++;
01020                 }
01021                 break;
01022         default:                /* Oh dear.  Called inappropriately. */
01023                 regerror("internal foulup");
01024                 count = 0;      /* Best compromise. */
01025                 break;
01026         }
01027         reginput = scan;
01028 
01029         return(count);
01030 }
01031 
01032 /*
01033  - regnext - dig the "next" pointer out of a node
01034  */
01035 static char *
01036 regnext(p)
01037 register char *p;
01038 {
01039         register int offset;
01040 
01041         if (p == &regdummy)
01042                 return(NULL);
01043 
01044         offset = NEXT(p);
01045         if (offset == 0)
01046                 return(NULL);
01047 
01048         if (OP(p) == BACK)
01049                 return(p-offset);
01050         else
01051                 return(p+offset);
01052 }
01053 
01054 #ifdef DEBUG
01055 
01056 STATIC char *regprop();
01057 
01058 /*
01059  - regdump - dump a regexp onto stdout in vaguely comprehensible form
01060  */
01061 void
01062 regdump(r)
01063 regexp *r;
01064 {
01065         register char *s;
01066         register char op = EXACTLY;     /* Arbitrary non-END op. */
01067         register char *next;
01068         extern char *strchr();
01069 
01070 
01071         s = r->program + 1;
01072         while (op != END) {     /* While that wasn't END last time... */
01073                 op = OP(s);
01074                 printf("%2d%s", s-r->program, regprop(s));      /* Where, what. */
01075                 next = regnext(s);
01076                 if (next == NULL)               /* Next ptr. */
01077                         printf("(0)");
01078                 else 
01079                         printf("(%d)", (s-r->program)+(next-s));
01080                 s += 3;
01081                 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
01082                         /* Literal string, where present. */
01083                         while (*s != '\0') {
01084                                 putchar(*s);
01085                                 s++;
01086                         }
01087                         s++;
01088                 }
01089                 putchar('\n');
01090         }
01091 
01092         /* Header fields of interest. */
01093         if (r->regstart != '\0')
01094                 printf("start `%c' ", r->regstart);
01095         if (r->reganch)
01096                 printf("anchored ");
01097         if (r->regmust != NULL)
01098                 printf("must have \"%s\"", r->regmust);
01099         printf("\n");
01100 }
01101 
01102 /*
01103  - regprop - printable representation of opcode
01104  */
01105 static char *
01106 regprop(op)
01107 char *op;
01108 {
01109         register char *p;
01110         static char buf[50];
01111 
01112         (void) strcpy(buf, ":");
01113 
01114         switch (OP(op)) {
01115         case BOL:
01116                 p = "BOL";
01117                 break;
01118         case EOL:
01119                 p = "EOL";
01120                 break;
01121         case ANY:
01122                 p = "ANY";
01123                 break;
01124         case ANYOF:
01125                 p = "ANYOF";
01126                 break;
01127         case ANYBUT:
01128                 p = "ANYBUT";
01129                 break;
01130         case BRANCH:
01131                 p = "BRANCH";
01132                 break;
01133         case EXACTLY:
01134                 p = "EXACTLY";
01135                 break;
01136         case NOTHING:
01137                 p = "NOTHING";
01138                 break;
01139         case BACK:
01140                 p = "BACK";
01141                 break;
01142         case END:
01143                 p = "END";
01144                 break;
01145         case OPEN+1:
01146         case OPEN+2:
01147         case OPEN+3:
01148         case OPEN+4:
01149         case OPEN+5:
01150         case OPEN+6:
01151         case OPEN+7:
01152         case OPEN+8:
01153         case OPEN+9:
01154                 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
01155                 p = NULL;
01156                 break;
01157         case CLOSE+1:
01158         case CLOSE+2:
01159         case CLOSE+3:
01160         case CLOSE+4:
01161         case CLOSE+5:
01162         case CLOSE+6:
01163         case CLOSE+7:
01164         case CLOSE+8:
01165         case CLOSE+9:
01166                 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
01167                 p = NULL;
01168                 break;
01169         case STAR:
01170                 p = "STAR";
01171                 break;
01172         case PLUS:
01173                 p = "PLUS";
01174                 break;
01175         default:
01176                 regerror("corrupted opcode");
01177                 break;
01178         }
01179         if (p != NULL)
01180                 (void) strcat(buf, p);
01181         return(buf);
01182 }
01183 #endif
01184 
01185 /*
01186  * The following is provided for those people who do not have strcspn() in
01187  * their C libraries.  They should get off their butts and do something
01188  * about it; at least one public-domain implementation of those (highly
01189  * useful) string routines has been published on Usenet.
01190  */
01191 #ifdef STRCSPN
01192 /*
01193  * strcspn - find length of initial segment of s1 consisting entirely
01194  * of characters not from s2
01195  */
01196 
01197 static int
01198 strcspn(s1, s2)
01199 char *s1;
01200 char *s2;
01201 {
01202         register char *scan1;
01203         register char *scan2;
01204         register int count;
01205 
01206         count = 0;
01207         for (scan1 = s1; *scan1 != '\0'; scan1++) {
01208                 for (scan2 = s2; *scan2 != '\0';)       /* ++ moved down. */
01209                         if (*scan1 == *scan2++)
01210                                 return(count);
01211                 count++;
01212         }
01213         return(count);
01214 }
01215 #endif
01216 
01217 /* In order to be able to copy a regexp, we need to know its size
01218    Denys Duchier, Dec 13, 1994 */
01219 
01220 long
01221 last_regsize() { return sizeof(regexp) + regsize; }

Generated on Sat Jan 26 08:48:07 2008 for WildLife by  doxygen 1.5.4