patternmatching/src/RegEx.m3


RegEx.m3 - regex(3) style regular expressions

This module implements regex(3) style regular expressions safe for multiple threads. See man regex for a description of a regex expression.

Edit History: Jan 30 1992 Schilit Created.

Copyright (c) 1991, 1992 Xerox Corporation. All rights reserved.

Use and copying of this software and preparation of derivative works based upon this software are permitted. Any distribution of this software or derivative works must comply with all applicable United States export control laws. This software is made available AS IS, and Xerox Corporation makes no warranty about the software, its performance or its conformity to any specification.

UNSAFE MODULE RegEx;

IMPORT Text, Word, Wr, Fmt, ASCII, Stdio, Thread, TextWr;
IMPORT Cstring;
FROM Wr IMPORT PutChar, PutText;

<*FATAL Thread.Alerted, Wr.Failure*>

VAR debug: BOOLEAN := FALSE;

REVEAL
  Pattern = BRANDED "Pattern version 1" OBJECT
              buff: REF ARRAY OF CHAR;
            METHODS
            END; (* object *)

TYPE
  failure_point = RECORD
                    s       : INTEGER;
                    prev_s  : INTEGER;
                    b       : INTEGER;
                    amount  : INTEGER;
                    previous: REF failure_point;
                  END; (* record *)

CONST
  MODIFIERS = SET OF CHAR{'*', '+', '?'};
  NBBY      = BITSIZE(CHAR);              (* bits in a char *)

CONST
  NUL            = '\000';
  OP_ENDOFPAT    = '\000'; (* sentinel for end of pattern *)
  OP_CHAR        = '\001'; (* a sequence of characters *)
  OP_ANY         = '\002'; (* any character *)
  OP_CHARSET     = '\003'; (* character class [foo] *)
  OP_NOTCHARSET  = '\004'; (* not character class [^foo] *)
  OP_STARTLINE   = '\005'; (* start line ^ *)
  OP_ENDLINE     = '\006'; (* end of line $ *)
  OP_STARTMEMORY = '\007'; (* start memory \( *)
  OP_STOPMEMORY  = '\010'; (* stop memory \) *)
  OP_MEMORY      = '\011'; (* memory reference \DIGIT *)
  MOD_MANY       = 8_100;  (* modifier for * and + *)
  MOD_ZERO       = 8_200;  (* modifier for * and ? *)
  MOD_STAR       = 8_300;  (* combination of above two *)

PROCEDURE Compile (pat: TEXT): Pattern RAISES {Error} =
  VAR
    pat_len      : INTEGER := Text.Length(pat);
    p            : INTEGER := 0;
    buff_len     : INTEGER := 10;
    buff         : REF ARRAY OF CHAR := NEW(REF ARRAY OF CHAR, buff_len);
    b            : INTEGER := 0;
    last_op      : INTEGER := -1;
    last_ch_op   : INTEGER := -1;
    c, cc, next_c: CHAR;
    mem_stop_cnt : INTEGER := 0;
    mem_start_cnt: INTEGER := 0;
    mem_stack: ARRAY [1 .. MEM_SIZE] OF RECORD mem_start_cnt: INTEGER; END;

    mem_stackp: INTEGER := 1;

  PROCEDURE NextChar (): CHAR =
    (* Return a character from the text pattern or NUL if at the end.
       Store into next_c the following character for lookahead. *)
    VAR this_c: CHAR;
    BEGIN
      this_c := next_c;
      IF p < pat_len - 1 THEN
        INC(p);
        next_c := Text.GetChar(pat, p);
      ELSE
        next_c := NUL;
      END; (* if *)
      RETURN this_c;
    END NextChar;

  PROCEDURE Put (c: CHAR) =
    (* Store c into buff[p] expanding buff if necessary.  The buffer
       pointer p is incremented. *)
    BEGIN
      IF b = buff_len - 1 THEN
        buff_len := 2 * buff_len;
        WITH new_buff = NEW(REF ARRAY OF CHAR, buff_len) DO
          FOR i := 0 TO b DO new_buff[i] := buff[i]; END; (* for *)
          buff := new_buff;
        END; (* with *)
      END; (* if *)
      buff[b] := c;
      INC(b);
    END Put;

  PROCEDURE PutChar (c: CHAR) =
    (* Store an OP_CHAR character.  The operand is a char count followed by
       the characters.  If possible coalesce with the previous OP_CHAR. *)
    BEGIN
      IF next_c IN MODIFIERS OR (* this OP_CHAR will have a modifier *)
           last_ch_op = -1 OR (* or no last OP_CHAR exists *)
           ORD(buff[last_ch_op]) + 1 + last_ch_op # b
           OR (* or not adjacent *)
           buff[last_ch_op] = LAST(CHAR) (* or over capacity *)
        THEN
        Put(OP_CHAR); (* New OP_CHAR *)
        IF NOT next_c IN MODIFIERS THEN
          last_ch_op := b; (* remember new index if coalescing ok *)
        END;
        Put(VAL(1, CHAR));
        Put(c);
      ELSE
        INC(buff[last_ch_op]);
        Put(c);
      END; (* if *)
    END PutChar;

  PROCEDURE PutCharSet (c: CHAR) =
    (* Store char c in charset at last_op in buff.  The buff at last_op
       contains OP_CHARSET|OP_NOTCHARSET, count, <bits>.  Set the bit for c
       and increase count if necessary. *)
    BEGIN
      WITH idx = Word.Divide(ORD(c), NBBY),
           ch  = buff[last_op + 2 + idx] DO
        ch :=
          VAL(Word.Or(ORD(ch), Word.LeftShift(1, Word.Mod(ORD(c), NBBY))),
              CHAR);
        (* keep track of size needed for CharSet *)
        IF idx + 1 > ORD(buff[last_op + 1]) THEN
          buff[last_op + 1] := VAL(idx + 1, CHAR);
        END;
      END; (* with *)
    END PutCharSet;

  BEGIN
    (* Initialize next_c for NextChar() use *)
    IF pat_len # 0 THEN next_c := Text.GetChar(pat, 0); END; (* if *)

    (* Loop processing chars from the pattern text and storing the compiled
       pattern in buff[p]. *)
    LOOP
      c := NextChar();

      (* If not a postfix modifier then remember buff[p] index as the "last
         operator" seen *)
      IF NOT c IN MODIFIERS THEN last_op := b; END; (* if *)

      CASE c OF

      | '^' =>
          (* ^ has special meaning as first char in pattern *)
          IF b = 0 THEN Put(OP_STARTLINE); ELSE PutChar('^'); END;

      | '.' => Put(OP_ANY);

      | '*', '+', '?' =>
          IF last_op = -1 THEN
            PutChar(c);
          ELSE
            WITH lastop = buff[last_op] DO
              IF lastop
                   IN SET OF
                        CHAR{OP_STARTMEMORY, OP_STOPMEMORY, OP_STARTLINE} THEN
                last_op := b;
                PutChar(c);
              ELSE
                IF c = '*' THEN
                  buff[last_op] :=
                    VAL(Word.Or(ORD(lastop), MOD_MANY + MOD_ZERO), CHAR);
                ELSIF c = '+' THEN
                  buff[last_op] :=
                    VAL(Word.Or(ORD(lastop), MOD_MANY), CHAR);
                ELSIF c = '?' THEN
                  buff[last_op] :=
                    VAL(Word.Or(ORD(lastop), MOD_ZERO), CHAR);
                END; (* if *)
              END; (* if *)
            END; (* with *)
          END; (* if *)

      | '$' =>
          (* $ has special meaning only at end of pattern *)
          IF next_c # NUL THEN PutChar('$'); ELSE Put(OP_ENDLINE); END;

      | '[' =>
          IF next_c = '^' THEN
            Put(OP_NOTCHARSET);
            EVAL NextChar();
          ELSE
            Put(OP_CHARSET);
          END; (* if *)
          Put(NUL); (* count byte *)

          (* initialize to zero and ensure enough storage in buff *)
          FOR i := 0 TO Word.Divide(ORD(LAST(CHAR)), NBBY) DO
            Put(NUL);
          END; (* for *)

          IF next_c = '-' OR next_c = ']' THEN
            PutCharSet(NextChar());
          END; (* if *)

          REPEAT
            c := NextChar();
            IF c = NUL THEN RAISE Error("Missing ]"); END;
            IF next_c = '-' THEN
              cc := c; (* start of sequence *)
              EVAL NextChar(); (* '-' *)
              IF next_c = ']' THEN
                PutCharSet(cc);
                PutCharSet('-')
              ELSE
                c := NextChar(); (* end of sequence *)
                IF cc > c THEN
                  RAISE Error("Range error in []");
                END; (* if *)
                FOR ch := cc TO c DO PutCharSet(ch); END;
              END; (* if *)
            ELSE
              PutCharSet(c);
            END;
          UNTIL next_c = ']';
          EVAL NextChar();
          (* set b to the min size necessary to store charset *)
          b := last_op + ORD(buff[last_op + 1]) + 2;

      | '\\' =>
          c := NextChar();
          CASE c OF
          | '(' =>
              IF mem_start_cnt >= MEM_SIZE THEN
                RAISE Error("Too many \\(\\) pairs");
              END;
              INC(mem_start_cnt);
              Put(OP_STARTMEMORY);
              Put(VAL(mem_start_cnt, CHAR));
              mem_stack[mem_stackp].mem_start_cnt := mem_start_cnt;
              INC(mem_stackp);

          | ')' =>
              IF mem_stackp <= FIRST(mem_stack) THEN
                RAISE Error("Unmatched \\)");
              END; (* if *)
              DEC(mem_stackp);
              Put(OP_STOPMEMORY);
              Put(VAL(mem_stack[mem_stackp].mem_start_cnt, CHAR));
              INC(mem_stop_cnt);

          | '1' .. '9' =>
              IF mem_stop_cnt < (ORD(c) - ORD('0')) THEN
                RAISE Error("No matching \\) for \\DIGIT");
              END; (* if *)
              Put(OP_MEMORY);
              Put(VAL(ORD(c) - ORD('0'), CHAR));

          ELSE
            PutChar(c);
          END; (* case *)

      | NUL => EXIT; (* end of input *)

      ELSE
        PutChar(c);
      END; (* case *)

    END; (* loop *)

    IF mem_stop_cnt # mem_start_cnt THEN
      RAISE Error("Missing \\)");
    END; (* if *)
    RETURN NEW(Pattern, buff := buff);
  END Compile;

<*INLINE*> PROCEDURE InCharSet (pat: Pattern; pos: INTEGER; c: CHAR):
  BOOLEAN =
  (* Check for character c in charset with count at position pos. *)
  BEGIN
    WITH idx = Word.Divide(ORD(c), NBBY) DO
      (* see if charset contains that bit *)
      IF idx + 1 > ORD(pat.buff[pos]) THEN RETURN FALSE; END;
      (* if so check the bit *)
      RETURN Word.And(ORD(pat.buff[pos + 1 + idx]),
                      Word.LeftShift(1, Word.Mod(ORD(c), NBBY))) # 0;
    END; (* with *)
  END InCharSet;

PROCEDURE Dump (READONLY pat: Pattern): TEXT =
  VAR wr: Wr.T := TextWr.New();
  BEGIN
    FOR b := FIRST(pat.buff^) TO LAST(pat.buff^) DO
      IF pat.buff[b] IN ASCII.AlphaNumerics THEN
        PutChar(wr, pat.buff[b]);
      ELSE
        PutText(wr, "\\" & Fmt.Pad(Fmt.Int(ORD(pat.buff[b]), 8), 3, '0'));
      END; (* if *)
      IF pat.buff[b] = OP_ENDOFPAT THEN EXIT; END;
    END; (* for *)
    RETURN TextWr.ToText(wr);
  END Dump;

PROCEDURE Decompile (READONLY pat: Pattern): TEXT =

  PROCEDURE Decompile_Part (b, to: INTEGER) =
    (* Decompile the pattern and print the result on wr. *)
    VAR
      op              : CHAR;
      many_ok, zero_ok: BOOLEAN;
    BEGIN
      LOOP
        IF b >= to THEN RETURN; END;

        WITH pch = ORD(pat.buff[b]) DO
          op := VAL(Word.And(pch, Word.Not(MOD_STAR)), CHAR);
          many_ok := Word.And(pch, MOD_MANY) # 0;
          zero_ok := Word.And(pch, MOD_ZERO) # 0;
        END; (* with *)

        INC(b);
        CASE op OF

        | OP_ENDOFPAT => EXIT;

        | OP_CHAR =>
            (* Operand is count followed by the characters *)
            WITH last = b + ORD(pat.buff[b]) DO
              INC(b); (* move past count *)
              WHILE b <= last DO
                PutChar(wr, pat.buff[b]);
                INC(b);
              END; (* while *)
            END;
        | OP_ANY => PutChar(wr, '.');

        | OP_CHARSET, OP_NOTCHARSET =>
            (* Operand is count followed by a bit-vector *)
            PutChar(wr, '[');
            IF op = OP_NOTCHARSET THEN PutChar(wr, '^'); END; (* if *)
            FOR c := FIRST(CHAR) TO LAST(CHAR) DO
              IF InCharSet(pat, b, c) THEN PutChar(wr, c); END;
            END;
            INC(b, ORD(pat.buff[b]) + 1);
            PutChar(wr, ']');

        | OP_STARTLINE => PutChar(wr, '^');
        | OP_ENDLINE => PutChar(wr, '$');
        | OP_STARTMEMORY => PutText(wr, "\\("); INC(b);
        | OP_STOPMEMORY => PutText(wr, "\\)"); INC(b);

        | OP_MEMORY =>
            (* Operand is the memory register number (a digit) *)
            PutChar(wr, '\\');
            PutChar(wr, VAL(ORD(pat.buff[b]) + ORD('0'), CHAR));
            INC(b);

        ELSE
          <*ASSERT FALSE*>(* unknown opcode! *)
        END; (* case *)
        IF many_ok AND zero_ok THEN
          PutChar(wr, '*');
        ELSIF many_ok THEN
          PutChar(wr, '+');
        ELSIF zero_ok THEN
          PutChar(wr, '?');
        END;
      END;
    END Decompile_Part;

  VAR wr: Wr.T := TextWr.New();
  BEGIN
    Decompile_Part(0, LAST(pat.buff^));
    RETURN TextWr.ToText(wr);
  END Decompile;

PROCEDURE Execute (READONLY pat    : Pattern;
                            str    : TEXT;
                            start  : CARDINAL := 0;
                            len    : CARDINAL := LAST(CARDINAL);
                            usr_mem: REF Memory := NIL           ):
  INTEGER =
  VAR
    mem    : Memory;
    str_max: CARDINAL;
    str_idx: INTEGER;

  PROCEDURE Advance (s, b: INTEGER): BOOLEAN =
    VAR
      op                      : CHAR;
      many_not_ok, zero_not_ok: BOOLEAN;
      prev_s                  : INTEGER;
      retreat_amt             : INTEGER := 1;
      failure                 : REF failure_point := NIL;
      eq                      : BOOLEAN;
      i                       : INTEGER;

    BEGIN
      IF debug THEN
        PutText(Stdio.stderr,
                "Advance: s = " & Fmt.Int(s) & " b = " & Fmt.Int(b) & "\n");
      END;

      LOOP
        LOOP
          (* fetch the opcode and determine if repeats are allowed *)
          WITH pch = ORD(pat.buff[b]) DO
            op := VAL(Word.And(pch, Word.Not(MOD_STAR)), CHAR);
            many_not_ok := Word.And(pch, MOD_MANY) = 0;
            zero_not_ok := Word.And(pch, MOD_ZERO) = 0;
          END; (* with *)

          (* step past the opcode and remember position *)
          INC(b);
          prev_s := s;

          CASE op OF
          | OP_ENDOFPAT => RETURN TRUE; (* only success if progress *)

          | OP_CHAR =>
              (* compare all characters at once *)
              WITH cnt = ORD(pat.buff[b]) DO
                INC(b); (* move past count *)
                REPEAT
                  (* check if comparison valid *)
                  IF s + cnt > str_max THEN EXIT; END;
                  (* FIXME: inefficient char comparison *)
                  i := 0; eq := TRUE;
                  WHILE eq AND i < cnt DO
                    eq := Text.GetChar(str, s + i) = pat.buff[b + i];
                    INC(i);
                  END;
                  IF NOT eq THEN EXIT END;
                  (* original code
                  (* tradeoff fast for unsafe comparison *)
                  IF Cstring.memcmp(
                       ADR(str[0]) + s, ADR(pat.buff[0]) + b, cnt) # 0 THEN
                    EXIT;
                  END;
                  *)
                  INC(s, cnt);
                UNTIL many_not_ok;
                INC(b, cnt);
              END; (* with *)

          | OP_ANY =>
              IF NOT many_not_ok THEN
                s := str_max;
              ELSIF s < str_max THEN
                INC(s);
              END;

          | OP_CHARSET, OP_NOTCHARSET =>
              REPEAT
                (* check if comparison with data is valid *)
                IF s >= str_max THEN EXIT; END;
                (* See if char present in charset. *)
                IF InCharSet(pat, b, Text.GetChar(str, s)) THEN
                  IF op # OP_CHARSET THEN EXIT; END;
                ELSE
                  IF op # OP_NOTCHARSET THEN EXIT; END;
                END;
                (* step to next char in data and loop if * or + allowed *)
                INC(s);
              UNTIL many_not_ok;
              (* Move past charset in pattern *)
              INC(b, ORD(pat.buff[b]) + 1);

          | OP_STARTLINE => IF s # 0 THEN EXIT END;

          | OP_ENDLINE => IF s # str_max THEN EXIT END;

            (* Process \( and \) which just store the current data index *)

          | OP_STARTMEMORY => mem[ORD(pat.buff[b])].start := s; INC(b);

          | OP_STOPMEMORY => mem[ORD(pat.buff[b])].stop := s; INC(b);

            (* Process \DIGIT or \DIGIT* which is a reference to a
               substring of the data already matched by a \(x\) sequence *)

          | OP_MEMORY =>
              WITH start = mem[ORD(pat.buff[b])].start DO
                (* In case many_ok needs it, set retreat_amt and cur_s for
                   backtracking at end of loop *)
                retreat_amt := mem[ORD(pat.buff[b])].stop - start;
                REPEAT
                  (* check if comparison with data is valid *)
                  IF s + retreat_amt > str_max THEN EXIT; END;
                  (* process one chunk at a time before incrementing s *)
                  i := 0; eq := TRUE;
                  WHILE eq AND i < retreat_amt DO
                    eq := Text.GetChar(str, s + i) =
                          Text.GetChar(str, start + i);
                    INC(i);
                  END;
                  IF NOT eq THEN EXIT END;
                  (* original code
                  IF Cstring.memcmp(
                       ADR(str[0]) + s, ADR(str[0]) + start, retreat_amt)
                       # 0 THEN
                    EXIT;
                  END;
                  *)
                  (* step to chunk char in data and loop if * allowed *)
                  INC(s, retreat_amt);
                UNTIL many_not_ok;
                INC(b);
              END; (* with *)

          ELSE
            <*ASSERT FALSE*>(* unknown opcode! *)
          END; (* case *)

          (* Here after matching one or more component.  If many components
             were eaten then add a failure point from the current position
             back until prev_s. *)

          IF op IN SET OF CHAR{
             OP_CHAR, OP_ANY, OP_CHARSET, OP_NOTCHARSET, OP_MEMORY} THEN
            IF zero_not_ok THEN
              IF prev_s = s AND retreat_amt # 0 THEN EXIT END;
              INC(prev_s, retreat_amt);
            END;
            IF NOT many_not_ok AND prev_s # s THEN
              failure :=
                NEW(REF failure_point, previous := failure, b := b, s := s,
                    prev_s := prev_s, amount := retreat_amt);
            END; (* if *)
            retreat_amt := 1;
          END;

        END; (* loop *)

        (* Here when EXIT from inner loop pop failure point and try
           again *)
        IF failure = NIL THEN RETURN FALSE; END;
        DEC(failure.s, failure.amount);
        IF debug THEN
          PutText(Stdio.stderr,
                  "Failure: b = " & Fmt.Int(b) & " s = " & Fmt.Int(s)
                    & " new b = " & Fmt.Int(failure.b) & " new s = "
                    & Fmt.Int(failure.s) & " prev_s = "
                    & Fmt.Int(failure.prev_s) & "\n");
        END; (* if *)

        s := failure.s;
        b := failure.b;
        IF failure.s <= failure.prev_s THEN
          failure := failure.previous;
        END; (* if *)
      END; (* loop *)
    END Advance;

  BEGIN
    WITH textlen = Text.Length(str) DO
      IF start > textlen THEN
        RETURN -1; (* not found *)
      ELSE
        str_idx := start;
        str_max := MIN(start + len, textlen)
      END;
    END;

    FOR i := FIRST(mem) TO LAST(mem) DO
      mem[i].start := -1;
      mem[i].stop := -1;
    END; (* for *)

    REPEAT
      IF Advance(str_idx, 0) THEN
        IF usr_mem # NIL THEN usr_mem^ := mem; END; (* if *)
        RETURN str_idx; (* return start index *)
      END;
      INC(str_idx);
    UNTIL str_idx >= str_max;
    RETURN -1; (* no match *)
  END Execute;

BEGIN

END RegEx.

interface RegEx is in: