MODULE; IMPORT Rd, Text, Thread, Fmt; IMPORT M3Keyword; CONST EOFChar = '\000'; VAR (* CONST after intialization *) WhiteSpace := ARRAY CHAR OF BOOLEAN { FALSE, .. }; AlphaNumerics := ARRAY CHAR OF BOOLEAN { FALSE, .. }; Digits := ARRAY CHAR OF BOOLEAN { FALSE, .. }; OctalDigits := ARRAY CHAR OF BOOLEAN { FALSE, .. }; HexDigits := ARRAY CHAR OF BOOLEAN { FALSE, .. }; CommentAlert := ARRAY CHAR OF BOOLEAN { FALSE, .. }; CharAlert := ARRAY CHAR OF BOOLEAN { FALSE, .. }; TYPE Public = T OBJECT METHODS initFromRd (source : Rd.T; skip_comments := TRUE; split_pragmas := TRUE): T; initFromBuf (buf : Buf; skip_comments := TRUE; split_pragmas := TRUE): T; END; REVEAL Default = Public BRANDED "M3Scanner.Default" OBJECT skip_comments : BOOLEAN := FALSE; split_pragmas : BOOLEAN := FALSE; line_start : CARDINAL := 0; next_offset : CARDINAL := 0; next_line : CARDINAL := 0; next_line_start : CARDINAL := 0; OVERRIDES next := GetToken; toText := ToText; className := ClassName; initFromRd := InitFromRd; initFromBuf := InitFromBuf; END; PROCEDURE M3Scanner ToText (t: Default): TEXT = BEGIN RETURN Text.FromChars (SUBARRAY (t.buffer^, t.offset, t.length)); END ToText; PROCEDUREClassName (<*UNUSED*> t: Default; tk: INTEGER): TEXT = BEGIN IF (FIRST (TokenName) <= tk) AND (tk <= LAST (TokenName)) THEN RETURN TokenName [tk]; ELSE RETURN NIL; END; END ClassName; PROCEDUREInitFromRd (t: Default; source : Rd.T; skip_comments : BOOLEAN; split_pragmas : BOOLEAN): T = VAR len0, len1: INTEGER; BEGIN t.skip_comments := skip_comments; t.split_pragmas := split_pragmas; t.line := 1; t.next_line := 1; (* read the source *) TRY len0 := Rd.Length (source); t.buffer := NEW (Buf, MAX (0, len0) + 1); IF (len0 < 0) THEN Err (t, 0, "can't tell how much source there is") END; len1 := Rd.GetSub (source, t.buffer^); IF (len0 # len1) THEN Err (t, 0, "couldn't read the entire source") END; t.buffer [len0] := EOFChar; EXCEPT | Rd.Failure => Err (t, 0, "I/O failure reading the source"); | Thread.Alerted => Err (t, 0, "*alerted*"); END; RETURN t; END InitFromRd; PROCEDUREInitFromBuf (t: Default; buf : Buf; skip_comments : BOOLEAN; split_pragmas : BOOLEAN): T = BEGIN t.skip_comments := skip_comments; t.split_pragmas := split_pragmas; t.line := 1; t.next_line := 1; t.buffer := buf; RETURN t; END InitFromBuf; PROCEDUREErr (t: Default; offset: CARDINAL; msg: TEXT) = BEGIN t.token := TK_Error; t.msg := msg; t.length := offset - t.offset; t.next_offset := offset; END Err; PROCEDUREGetToken (t: Default) = VAR buf := t.buffer; offset := t.next_offset; ch := buf [offset]; BEGIN t.line := t.next_line; t.line_start := t.next_line_start; LOOP (* skip white space *) WHILE (WhiteSpace[ch]) DO INC (offset); IF (ch = '\n') THEN INC (t.line); INC (t.next_line); t.line_start := offset; t.next_line_start := offset; END; ch := buf [offset]; END; (* remember where this token starts *) t.offset := offset; t.start := offset; t.column := offset - t.line_start; CASE ch OF | 'a'..'z', 'A'..'Z' => (* scan an identifier *) WHILE AlphaNumerics[ch] DO INC (offset); ch := buf[offset]; END; t.token := M3Keyword.Classify (SUBARRAY (buf^, t.offset, offset - t.offset)); EXIT; | '0'..'9' => ScanNumber (t); RETURN; | '\'' => ScanChar (t); RETURN; | '\"' => ScanText (t); RETURN; | '+' => t.token := TK_Plus; INC (offset); EXIT; | '-' => t.token := TK_Minus; INC (offset); EXIT; | '/' => t.token := TK_Slash; INC (offset); EXIT; | '&' => t.token := TK_Ampersand; INC (offset); EXIT; | ',' => t.token := TK_Comma; INC (offset); EXIT; | ';' => t.token := TK_Semi; INC (offset); EXIT; | '[' => t.token := TK_L_bracket; INC (offset); EXIT; | '{' => t.token := TK_L_brace; INC (offset); EXIT; | '^' => t.token := TK_Arrow; INC (offset); EXIT; | '#' => t.token := TK_Sharp; INC (offset); EXIT; | ')' => t.token := TK_R_paren; INC (offset); EXIT; | ']' => t.token := TK_R_bracket; INC (offset); EXIT; | '}' => t.token := TK_R_brace; INC (offset); EXIT; | '|' => t.token := TK_Bar; INC (offset); EXIT; | EOFChar => t.token := TK_EOF; EXIT; | '*' => (* '*>' '*' *) INC (offset); ch := buf [offset]; IF (ch = '>') THEN t.token := TK_End_pragma; INC (offset); ELSE t.token := TK_Asterisk; END; EXIT; | '=' => (* '=' '=>' *) INC (offset); ch := buf [offset]; IF (ch = '>') THEN t.token := TK_Implies; INC (offset); ELSE t.token := TK_Equal; END; EXIT; | ':' => (* ':' ':=' *) INC (offset); ch := buf [offset]; IF (ch = '=') THEN t.token := TK_Assign; INC (offset); ELSE t.token := TK_Colon; END; EXIT; | '.' => (* '.' '..' *) INC (offset); ch := buf [offset]; IF (ch = '.') THEN t.token := TK_Dot_dot; INC (offset); ELSE t.token := TK_Dot; END; EXIT; | '(' => (* '('*' '(' *) INC (offset); ch := buf [offset]; IF (ch # '*') THEN t.token := TK_L_paren; EXIT; ELSIF ScanComment (t) THEN RETURN; ELSE offset := t.offset; ch := buf [offset]; END; | '>' => (* '>' '>=' *) INC (offset); ch := buf [offset]; IF (ch = '=') THEN t.token := TK_Gr_equal; INC (offset); ELSE t.token := TK_Greater; END; EXIT; | '<' => (* '<' '<=' '<:' '<*' *) INC (offset); ch := buf [offset]; IF (ch = '=') THEN t.token := TK_Ls_equal; INC (offset); ELSIF (ch = ':') THEN t.token := TK_Subtype; INC (offset); ELSIF (ch # '*') THEN t.token := TK_Less; ELSIF (NOT t.split_pragmas) THEN ScanPragma (t); RETURN; ELSE t.token := TK_Begin_pragma; INC (offset); END; EXIT; ELSE INC (offset); Err (t, offset, "Illegal character: " & Fmt.Int (ORD (ch))); EXIT; END; (*case*) END; (*loop*) (* record the length of the token and where we are *) t.length := offset - t.offset; t.next_offset := offset; END GetToken; PROCEDUREScanNumber (t: Default) = VAR offset := t.offset; ch := t.buffer [offset]; BEGIN (* scan the decimal digits *) WHILE Digits[ch] DO INC (offset); ch := t.buffer [offset]; END; IF (ch = '_') THEN (* scan a based integer *) INC (offset); ch := t.buffer [offset]; IF HexDigits[ch] THEN WHILE HexDigits[ch] DO INC (offset); ch := t.buffer [offset]; END; IF (ch = 'L') OR (ch = 'l') THEN t.token := TK_Long_const; t.length := offset - t.offset; t.next_offset := offset + 1; RETURN; ELSE t.token := TK_Card_const; END; ELSE Err (t, offset, "illegal based integer: missing digits"); RETURN; END; ELSIF (ch = '.') AND (t.buffer [offset+1] = '.') THEN (* we saw "dddd.." *) t.token := TK_Card_const; ELSIF (ch = '.') THEN (* scan a floating point number *) INC (offset); ch := t.buffer [offset]; (* scan the fractional digits *) IF Digits[ch] THEN WHILE Digits[ch] DO INC (offset); ch := t.buffer [offset]; END; ELSE Err (t, offset, "missing digits in real fraction"); RETURN; END; (* check for the exponent *) IF (ch = 'e') OR (ch = 'E') THEN t.token := TK_Real_const; ELSIF (ch = 'd') OR (ch = 'D') THEN t.token := TK_Longreal_const; ELSIF (ch = 'x') OR (ch = 'X') THEN t.token := TK_Extended_const; ELSE (* real constant with no exponent *) t.token := TK_Real_const; t.next_offset := offset; t.length := offset - t.offset; RETURN; END; INC (offset); ch := t.buffer [offset]; (* eat the e/d/x *) (* get the exponent sign *) IF (ch = '+') OR (ch = '-') THEN INC (offset); ch := t.buffer [offset]; END; (* finally, get the exponent digits *) IF Digits[ch] THEN WHILE Digits[ch] DO INC (offset); ch := t.buffer [offset]; END; ELSE Err (t, offset, "missing digits in real exponent"); END; ELSE (* scanned a simple decimal integer *) IF (ch = 'L') OR (ch = 'l') THEN t.token := TK_Long_const; t.length := offset - t.offset; t.next_offset := offset + 1; ELSE t.token := TK_Card_const; END; END; t.length := offset - t.offset; t.next_offset := offset; END ScanNumber; PROCEDUREScanChar (t: Default) = VAR offset : CARDINAL := t.offset+1; ch : CHAR := t.buffer [offset]; BEGIN IF NOT CharAlert [ch] THEN (* a simple character literal *) INC (offset); ch := t.buffer[offset]; ELSIF (ch = '\'') THEN INC (offset); Err (t, offset, "missing character in character literal"); RETURN; ELSIF (ch = '\n') OR (ch = '\r') OR (ch = '\f') THEN Err (t, offset, "end-of-line encountered in character literal"); RETURN; ELSIF (ch = '\\') THEN INC (offset); IF NOT ScanEscape (t, offset) THEN Err (t, offset, "unknown escape sequence in character literal"); RETURN; END; ch := t.buffer [offset]; ELSIF (ch = EOFChar) THEN Err (t, offset, "EOF encountered in character literal"); RETURN; ELSE (* a simple character literal *) INC (offset); ch := t.buffer[offset]; END; IF (ch # '\'') THEN Err (t, offset, "missing closing quote on character literal"); RETURN; END; INC (offset); (* eat the closing quote *) t.token := TK_Char_const; t.length := offset - t.offset; t.next_offset := offset; END ScanChar; PROCEDUREScanText (t: Default) = VAR offset : CARDINAL := t.offset+1; ch : CHAR := t.buffer [offset]; BEGIN LOOP IF NOT CharAlert [ch] THEN INC (offset); ch := t.buffer [offset]; ELSIF (ch = '\"') THEN INC (offset); EXIT; ELSIF (ch = '\n') OR (ch = '\r') OR (ch = '\f') THEN Err (t, offset, "end-of-line encountered in text literal"); RETURN; ELSIF (ch = '\\') THEN INC (offset); IF NOT ScanEscape (t, offset) THEN Err (t, offset, "unknown escape sequence in text literal"); RETURN; END; ch := t.buffer [offset]; ELSIF (ch = EOFChar) THEN Err (t, offset, "EOF encountered in text literal"); RETURN; ELSE (* a simple character *) INC (offset); ch := t.buffer [offset]; END; END; t.token := TK_Text_const; t.length := offset - t.offset; t.next_offset := offset; END ScanText; PROCEDUREScanEscape (t: Default; VAR offset: CARDINAL): BOOLEAN = VAR ch := t.buffer [offset]; BEGIN INC (offset); IF (ch = 'n') OR (ch = 't') OR (ch = 'r') OR (ch = 'f') OR (ch = '\\') OR (ch = '\'') OR (ch = '\"') THEN RETURN TRUE; END; IF NOT OctalDigits [ch] THEN RETURN FALSE END; ch := t.buffer [offset]; INC (offset); IF NOT OctalDigits [ch] THEN RETURN FALSE END; ch := t.buffer [offset]; INC (offset); RETURN OctalDigits [ch]; END ScanEscape; PROCEDUREScanComment (t: Default): BOOLEAN = VAR nest := 1; start := t.offset; offset := start + 2; (* skip the opening '(' and '*' *) ch := t.buffer [offset]; my_line := t.line; l_start := t.line_start; BEGIN WHILE (nest > 0) DO (************** UNSAFE VERSION ********************** VAR p: UNTRACED REF CHAR; p := ADR (t.buffer[offset]); WHILE NOT CommentAlert[ch] DO INC (p); ch := p^; END; offset := p - ADR (t.buffer[0]); ******************************************************) WHILE NOT CommentAlert[ch] DO INC (offset); ch := t.buffer[offset]; END; IF (ch = '*') THEN INC (offset); ch := t.buffer [offset]; IF (ch = ')') THEN INC (offset); ch := t.buffer[offset]; DEC (nest); END; ELSIF (ch = '(') THEN INC (offset); ch := t.buffer [offset]; IF (ch = '*') THEN INC (offset); ch := t.buffer[offset]; INC (nest); END; ELSIF (ch = EOFChar) THEN t.next_line := my_line; t.next_line_start := l_start; Err (t, offset, "EOF encountered in comment"); RETURN TRUE; ELSIF (ch = '\n') THEN INC (offset); ch := t.buffer [offset]; INC (my_line); l_start := offset; ELSE INC (offset); ch := t.buffer [offset]; END; END; IF t.skip_comments THEN t.offset := offset; t.line := my_line; t.line_start := l_start; t.next_offset := offset; t.next_line := my_line; t.next_line_start := l_start; RETURN FALSE; ELSE t.token := TK_Comment; t.length := offset - t.offset; t.next_offset := offset; t.next_line := my_line; t.next_line_start := l_start; RETURN TRUE; END; END ScanComment; PROCEDUREScanPragma (t: Default) = VAR start := t.offset; offset := start + 2; (* skip the opening '<' and '*' *) ch := t.buffer [offset]; my_line := t.line; l_start := t.line_start; BEGIN (* scan the entire pragma as a string *) WHILE (ch # '*') OR (t.buffer[offset+1] # '>') DO IF (ch = EOFChar) THEN Err (t, offset, "EOF encountered in pragma"); RETURN; END; INC (offset); IF (ch = '\n') THEN INC (my_line); l_start := offset; END; ch := t.buffer [offset]; END; INC (offset, 2); t.token := TK_Begin_pragma; t.length := offset - t.offset; t.next_offset := offset; t.next_line := my_line; t.next_line_start := l_start; END ScanPragma; BEGIN WhiteSpace [' '] := TRUE; WhiteSpace ['\n'] := TRUE; WhiteSpace ['\t'] := TRUE; WhiteSpace ['\r'] := TRUE; WhiteSpace ['\f'] := TRUE; AlphaNumerics ['_'] := TRUE; FOR c := 'a' TO 'z' DO AlphaNumerics [c] := TRUE END; FOR c := 'A' TO 'Z' DO AlphaNumerics [c] := TRUE END; FOR c := '0' TO '9' DO AlphaNumerics [c] := TRUE END; FOR c := '0' TO '9' DO Digits [c] := TRUE END; FOR c := '0' TO '7' DO OctalDigits [c] := TRUE END; FOR c := '0' TO '9' DO HexDigits [c] := TRUE END; FOR c := 'a' TO 'f' DO HexDigits [c] := TRUE END; FOR c := 'A' TO 'F' DO HexDigits [c] := TRUE END; CommentAlert ['*'] := TRUE; CommentAlert ['('] := TRUE; CommentAlert [EOFChar] := TRUE; CommentAlert ['\n'] := TRUE; CharAlert ['\''] := TRUE; CharAlert ['\"'] := TRUE; CharAlert ['\n'] := TRUE; CharAlert ['\r'] := TRUE; CharAlert ['\f'] := TRUE; CharAlert ['\\'] := TRUE; CharAlert [EOFChar] := TRUE; END M3Scanner.