Goto sanos source index

//
// regex2.h
//
// Internal definitions for regular expression library
//
// Ported to sanos by Michael Ringgaard.
//
// Copyright 1992, 1993, 1994, 1997 Henry Spencer.  All rights reserved.
// This software is not subject to any license of the American Telephone
// and Telegraph Company or of the Regents of the University of California.
// 
// Permission is granted to anyone to use this software for any purpose on
// any computer system, and to alter it and redistribute it, subject
// to the following restrictions:
// 
// 1. The author is not responsible for the consequences of use of this
//    software, no matter how awful, even if they arise from flaws in it.
// 
// 2. The origin of this software must not be misrepresented, either by
//    explicit claim or by omission.  Since few users ever read sources,
//    credits must appear in the documentation.
// 
// 3. Altered versions must be plainly marked as such, and must not be
//    misrepresented as being the original software.  Since few users
//    ever read sources, credits must appear in the documentation.
// 
// 4. This notice may not be removed or altered.
// 

//
// The internal representation is a *strip*, a sequence of
// operators ending with an endmarker.  (Some terminology etc. is a
// historical relic of earlier versions which used multiple strips.)
// Certain oddities in the representation are there to permit running
// the machinery backwards; in particular, any deviation from sequential
// flow must be marked at both its source and its destination.  Some
// fine points:
//
// - OPLUS_ and O_PLUS are *inside* the loop they create.
// - OQUEST_ and O_QUEST are *outside* the bypass they create.
// - OCH_ and O_CH are *outside* the multi-way branch they create, while
//   OOR1 and OOR2 are respectively the end and the beginning of one of
//   the branches.  Note that there is an implicit OOR2 following OCH_
//   and an implicit OOR1 preceding O_CH.
//
// In state representations, an operator's bit is on to signify a state
// immediately *preceding* "execution" of that operator.
//

typedef unsigned char uch;

typedef long sop;   // strip operator
typedef long sopno;

#define OPRMASK 0x7c000000
#define OPDMASK 0x03ffffff
#define OPSHIFT (26)

#define OP(n)         ((n) & OPRMASK)
#define OPND(n)       ((n) & OPDMASK)
#define SOP(op, opnd) ((op) | (opnd))

// operators                       meaning      operand
//                                              (back, fwd are offsets)
#define OEND    (1 << OPSHIFT)  // endmarker    -
#define OCHAR   (2 << OPSHIFT)  // character    unsigned char
#define OBOL    (3 << OPSHIFT)  // left anchor  -
#define OEOL    (4 << OPSHIFT)  // right anchor -
#define OANY    (5 << OPSHIFT)  // .            -    
#define OANYOF  (6 << OPSHIFT)  // [...]        set number
#define OBACK_  (7 << OPSHIFT)  // begin \d     paren number
#define O_BACK  (8 << OPSHIFT)  // end \d       paren number
#define OPLUS_  (9 << OPSHIFT)  // + prefix     fwd to suffix
#define O_PLUS  (10 << OPSHIFT) // + suffix     back to prefix
#define OQUEST_ (11 << OPSHIFT) // ? prefix     fwd to suffix
#define O_QUEST (12 << OPSHIFT) // ? suffix     back to prefix
#define OLPAREN (13 << OPSHIFT) // (            fwd to )
#define ORPAREN (14 << OPSHIFT) // )            back to (
#define OCH_    (15 << OPSHIFT) // begin choice fwd to OOR2
#define OOR1    (16 << OPSHIFT) // | pt. 1      back to OOR1 or OCH_
#define OOR2    (17 << OPSHIFT) // | pt. 2      fwd to OOR2 or O_CH
#define O_CH    (18 << OPSHIFT) // end choice   back to OOR1
#define OBOW    (19 << OPSHIFT) // begin word   -
#define OEOW    (20 << OPSHIFT) // end word     -

//
// Structure for [] character-set representation.  Character sets are
// done as bit vectors, grouped 8 to a byte vector for compactness.
// The individual set therefore has both a pointer to the byte vector
// and a mask to pick out the relevant bit of each byte.  A hash code
// simplifies testing whether two sets could be identical.
//
// This will get trickier for multicharacter collating elements.  As
// preliminary hooks for dealing with such things, we also carry along
// a string of multi-character elements, and decide the size of the
// vectors at run time.
//
typedef struct {
  uch *ptr;       // -> uch [csetsize]
  uch mask;       // bit within array
  uch hash;       // hash code
  size_t smultis;
  char *multis;   // -> char[smulti]  ab\0cd\0ef\0\0
} cset;

// Note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1
#define CHadd(cs, c)  ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
#define CHsub(cs, c)  ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
#define MCadd(p, cs, cp)  mcadd(p, cs, cp)  // regcomp() internal fns
#define MCsub(p, cs, cp)  mcsub(p, cs, cp)
#define MCin(p, cs, cp) mcin(p, cs, cp)

// Stuff for character categories
typedef unsigned char cat_t;

//
// Main compiled-expression structure
//

#define  USEBOL  01  // used ^
#define  USEEOL  02  // used $
#define  BAD     04  // something wrong

struct re_guts {
  int magic;
  sop *strip;            // malloced area for strip
  int csetsize;          // number of bits in a cset vector
  int ncsets;            // number of csets in use
  cset *sets;            // -> cset [ncsets]
  uch *setbits;          // -> uch[csetsize][ncsets/CHAR_BIT]
  int cflags;            // copy of regcomp() cflags argument
  sopno nstates;         // = number of sops
  sopno firststate;      // the initial OEND (normally 0)
  sopno laststate;       // the final OEND
  int iflags;            // internal flags
  int nbol;              // number of ^ used
  int neol;              // number of $ used
  int ncategories;       // how many character categories
  cat_t *categories;     // ->catspace[-CHAR_MIN]
  char *must;            // match must contain this string
  int mlen;              // length of must
  size_t nsub;           // copy of re_nsub
  int backrefs;          // does it use back references?
  sopno nplus;           // how deep does it nest +s?
  // catspace must be last
  cat_t catspace[1];     // actually [NC]
};

#define MAGIC1 ((('r' ^ 0200) << 8) | 'e')
#define MAGIC2 ((('R' ^ 0200) << 8) | 'E')

// Misc utilities
#define OUT (CHAR_MAX + 1)  // a non-character value
#define ISWORD(c) (isalnum(c) || (c) == '_')

#define DUPMAX  255
#define INFINITY  (DUPMAX + 1)
#define NC (CHAR_MAX - CHAR_MIN + 1)

// Switch off assertions (if not already off) if no REDEBUG
#ifndef REDEBUG
#ifndef NDEBUG
#define NDEBUG  // no assertions please
#endif
#endif

#include <assert.h>