RegExpr
provides regular expression matching of text strings
MODULE------------------------------------------------------------ compiling ---; IMPORT Text; REVEAL T = BRANDED "RegExpr.T" REF RECORD body : TEXT; root : CARDINAL; ops : REF ARRAY OF Desc; END; TYPE Op = { Or, And, Concat, AnyString, ThisString }; StrLen = [0..16_ffff]; Desc = RECORD a, b: CARDINAL; (* operands: (left, right) or string: (start, len) *) min, max: StrLen; ch: CHAR; op: Op; END; RegExpr
TYPE ParseState = RECORD body : TEXT; ops : REF ARRAY OF Desc; ch : CHAR; len : CARDINAL; next_ch : CARDINAL; next_buf : CARDINAL; next_op : CARDINAL; END; PROCEDURE------------------------------------------------------------- matching ---Compile (pattern: TEXT): T RAISES {Error} = VAR t: T; s: ParseState; BEGIN IF (pattern = NIL) THEN RAISE Error ("NIL pattern"); END; t := NEW (T, body := pattern); s.len := Text.Length (pattern); IF (s.len < 1) THEN s.next_op := 0; t.ops := NEW (REF ARRAY OF Desc, 1); t.root := EmptyString (s); ELSE s.body := pattern; s.ops := NEW (REF ARRAY OF Desc, s.len + s.len); s.ch := Text.GetChar (pattern, 0); s.next_ch := 1; s.next_buf := 0; s.next_op := 0; t.root := ParseExpr (s); t.ops := s.ops; END; RETURN t; END Compile; PROCEDUREParseExpr (VAR s: ParseState): CARDINAL RAISES {Error} = VAR a, b: CARDINAL; BEGIN a := ParseTerm (s); WHILE (s.ch = '|') DO NextCh (s); b := ParseTerm (s); WITH z = s.ops [s.next_op] DO z.op := Op.Or; z.a := a; z.b := b; z.min := MIN (s.ops[a].min, s.ops[b].min); z.max := MAX (s.ops[b].max, s.ops[b].max); END; a := s.next_op; INC (s.next_op); END; RETURN a; END ParseExpr; PROCEDUREParseTerm (VAR s: ParseState): CARDINAL RAISES {Error} = VAR a, b: CARDINAL; BEGIN a := ParseFactor (s); WHILE (s.ch = '&') DO NextCh (s); b := ParseFactor (s); WITH z = s.ops [s.next_op] DO z.op := Op.And; z.a := a; z.b := b; z.min := MAX (s.ops[a].min, s.ops[b].min); z.max := MIN (s.ops[b].max, s.ops[b].max); END; a := s.next_op; INC (s.next_op); END; RETURN a; END ParseTerm; PROCEDUREParseFactor (VAR s: ParseState): CARDINAL RAISES {Error} = VAR a, b: CARDINAL; BEGIN a := ParsePrimary (s); WHILE (s.next_ch <= s.len) AND (s.ch # '|') AND (s.ch # '&') AND (s.ch # ')') DO b := ParsePrimary (s); WITH z = s.ops [s.next_op] DO z.op := Op.Concat; z.a := a; z.b := b; z.min := MIN (s.ops[a].min + s.ops[b].min, LAST (StrLen)); z.max := MIN (s.ops[a].max + s.ops[b].max, LAST (StrLen)); END; a := s.next_op; INC (s.next_op); END; RETURN a; END ParseFactor; PROCEDUREParsePrimary (VAR s: ParseState): CARDINAL RAISES {Error} = VAR x := s.next_op; BEGIN CASE s.ch OF | '&', '|', ')' => RETURN EmptyString (s); | '*' => NextCh (s); WITH z = s.ops [x] DO z.op := Op.AnyString; z.min := 0; z.max := LAST (StrLen); END; INC (s.next_op); RETURN x; | '@' => NextCh (s); WITH z = s.ops [x] DO z.op := Op.AnyString; z.min := 1; z.max := 1; END; INC (s.next_op); RETURN x; | '(' => NextCh (s); x := ParseExpr (s); IF (s.ch = ')') THEN NextCh (s); (* ok *) ELSE RAISE Error ("unmatched parenthesis"); END; RETURN x; ELSE RETURN ParseString (s); END; END ParsePrimary; PROCEDUREParseString (VAR s: ParseState): CARDINAL = VAR x := s.next_op; BEGIN INC (s.next_op); WITH z = s.ops [x] DO z.op := Op.ThisString; z.a := s.next_buf; z.b := 0; WHILE (s.next_ch <= s.len) AND (s.ch # '|') AND (s.ch # '&') AND (s.ch # '*') AND (s.ch # '@') AND (s.ch # '(') AND (s.ch # ')') DO IF (s.ch = '\134') AND (s.next_ch < s.len) THEN NextCh (s); (* eat the backslash escape *) END; s.ops[s.next_buf].ch := s.ch; INC (s.next_buf); INC (z.b); NextCh (s); END; z.min := z.b; z.max := z.b END; RETURN x; END ParseString; PROCEDUREEmptyString (VAR s: ParseState): CARDINAL = VAR x := s.next_op; BEGIN WITH z = s.ops[x] DO z.op := Op.ThisString; z.a := 0; z.b := 0; z.min := 0; z.max := 0; END; INC (s.next_op); RETURN x; END EmptyString; PROCEDURENextCh (VAR s: ParseState) = BEGIN IF (s.next_ch < s.len) THEN s.ch := Text.GetChar (s.body, s.next_ch); INC (s.next_ch); ELSE s.ch := '\000'; INC (s.next_ch); END; END NextCh;
PROCEDURE----------------------------------------------------------------- misc ---Match (t: T; txt: TEXT): BOOLEAN = BEGIN RETURN MatchSubstring (t, txt, 0, 0); END Match; PROCEDUREMatchSubstring (t: T; txt: TEXT; pre, post: CARDINAL): BOOLEAN = VAR len: INTEGER; buf: ARRAY [0..255] OF CHAR; ref: REF ARRAY OF CHAR; BEGIN IF (t = NIL) THEN RETURN TRUE; END; IF (txt = NIL) THEN RETURN FALSE; END; len := Text.Length (txt) - pre - post; IF (len <= NUMBER (buf)) THEN Text.SetChars (buf, txt, pre); RETURN MatchSub (t, SUBARRAY (buf, 0, len)); ELSE ref := NEW (REF ARRAY OF CHAR, len); Text.SetChars (buf, txt, pre); RETURN MatchSub (t, ref^); END; END MatchSubstring; PROCEDUREMatchSub (t: T; READONLY str: ARRAY OF CHAR): BOOLEAN = BEGIN IF (t = NIL) THEN RETURN TRUE; END; IF (NUMBER (str) <= 0) THEN RETURN FALSE; END; RETURN DoMatch (t.ops, t.root, str, 0, NUMBER (str)); END MatchSub; PROCEDUREDoMatch (ops: REF ARRAY OF Desc; x: CARDINAL; READONLY txt: ARRAY OF CHAR; start, len: INTEGER): BOOLEAN = BEGIN WITH z = ops[x] DO IF (len < z.min) OR (z.max < len) THEN RETURN FALSE; END; CASE z.op OF | Op.Or => RETURN DoMatch (ops, z.a, txt, start, len) OR DoMatch (ops, z.b, txt, start, len); | Op.And => RETURN DoMatch (ops, z.a, txt, start, len) AND DoMatch (ops, z.b, txt, start, len); | Op.AnyString => RETURN TRUE; | Op.ThisString => FOR i := 0 TO z.b - 1 DO IF (ops[z.a + i].ch # txt [start + i]) THEN RETURN FALSE; END; END; RETURN TRUE; | Op.Concat => WITH za = ops[z.a], zb = ops[z.b] DO VAR max_a := MIN (za.max, len - zb.min); min_a := MAX (za.min, len - zb.max); BEGIN FOR i := max_a TO min_a BY -1 DO IF DoMatch (ops, z.a, txt, start, i ) AND DoMatch (ops, z.b, txt, start+i, len-i) THEN RETURN TRUE; END; END; RETURN FALSE; END; END; END; (* CASE*) END; (* WITH *) END DoMatch;
PROCEDURESimpleString (t: T): TEXT = BEGIN IF (t = NIL) THEN RETURN NIL; END; WITH z = t.ops [t.root] DO IF (z.op = Op.ThisString) AND (z.a = 0) AND (z.b = Text.Length (t.body)) THEN RETURN t.body; END; END; RETURN NIL; END SimpleString; BEGIN END RegExpr.