m3tools/src/M3Lexer.m3


 Copyright 1996-2000 Critical Mass, Inc. All rights reserved.    
 See file COPYRIGHT-CMASS for details. 

MODULE M3Lexer;

IMPORT Rd, Target, TInt, TFloat, TWord;
IMPORT M3ID, M3Scanner, Text;

FROM M3Scanner IMPORT TK_Comment, TK_Error, TK_EOF,
  TK_Ident, TK_Card_const, TK_Long_const, TK_Real_const, TK_Longreal_const,
  TK_Extended_const, TK_Char_const, TK_Text_const,
  TK_Begin_pragma, TK_End_pragma, TK_Comma;

CONST
  Backslash = '\134';  (* ASCII / Unicode value *)

TYPE
  SUPER = M3Scanner.Default;

REVEAL
  T = T_ BRANDED "M3Lexer.T" OBJECT
    saved        : BOOLEAN := FALSE;
    next_token   : CARDINAL  := TK_Comment;
    next_start   : CARDINAL  := 0;
    next_length  : CARDINAL  := 0;
    next_offset  : CARDINAL  := 0;
    next_line    : CARDINAL  := 0;
    next_column  : CARDINAL  := 0;
    next_msg     : TEXT      := NIL;
  OVERRIDES
    next        := NextToken;
    className   := ClassName;
    initFromRd  := InitFromRd;
    initFromBuf := InitFromBuf;
  END;

TYPE
  Pragma = REF RECORD
    id   : M3ID.T;
    val  : INTEGER;
    next : Pragma;
  END;

VAR
  pragmas: Pragma := NIL;

CONST
  CNames = ARRAY [TK_Inline .. TK_Fatal] OF TEXT {
    "<*INLINE*>", "<*EXTERNAL*>", "<*ASSERT*>", "<*UNUSED*>",
    "<*OBSOLETE*>", "<*TRACE*>", "<* calling convention *>",
    "<*FATAL*>"
  };

PROCEDURE ClassName (t: T;  tk: INTEGER): TEXT =
  BEGIN
    IF (FIRST (CNames) <= tk) AND (tk <= LAST (CNames))
      THEN RETURN CNames [tk];
      ELSE RETURN SUPER.className (t, tk);
    END;
  END ClassName;

PROCEDURE InitFromRd (t: T;  rd: Rd.T): T =
  BEGIN
    pragmas := NIL;
    RETURN SUPER.initFromRd (t, rd,
             skip_comments := TRUE, split_pragmas := TRUE);
  END InitFromRd;

PROCEDURE InitFromBuf (t: T;  buf: Buf): T =
  BEGIN
    pragmas := NIL;
    RETURN SUPER.initFromBuf (t, buf,
             skip_comments := TRUE, split_pragmas := TRUE);
  END InitFromBuf;

PROCEDURE NextToken (t: T) =
  BEGIN
    IF (t.saved) THEN
      t.saved  := FALSE;
      t.token  := t.next_token;
      t.start  := t.next_start;
      t.length := t.next_length;
      t.offset := t.next_offset;
      t.line   := t.next_line;
      t.column := t.next_column;
      t.msg    := t.next_msg;
    ELSE
      REPEAT
        SUPER.next (t);
        CASE t.token OF
        | TK_Ident          => FixID (t);
        | TK_Card_const     => FixInt (t, Target.Integer.bytes);
        | TK_Long_const     => FixInt (t, Target.Longint.bytes);
        | TK_Real_const     => FixFloat (t, Target.Precision.Short);
        | TK_Longreal_const => FixFloat (t, Target.Precision.Long);
        | TK_Extended_const => FixFloat (t, Target.Precision.Extended);
        | TK_Char_const     => FixChar (t);
        | TK_Text_const     => FixText (t);
        | TK_Begin_pragma   => FixPragma (t);
        ELSE (* let the token through... *)
        END;
      UNTIL (t.token # TK_Comment);
    END;
  END NextToken;

PROCEDURE FixID (t: T) =
  BEGIN
    t.id := M3ID.FromStr (SUBARRAY (t.buffer^, t.offset, t.length));
  END FixID;

PROCEDURE FixInt (t: T;  bytes: CARDINAL) =
  VAR break := -1;  base: INTEGER;
  BEGIN
    FOR i := t.offset TO t.offset + t.length - 1 DO
      IF (t.buffer [i] = '_') THEN
        (* we have a based integer... *)
        break := i;
        EXIT;
      END;
    END;

    IF (break < 0) THEN (* scan a simple integer *)
      IF NOT TInt.New (SUBARRAY (t.buffer^, t.offset, t.length), bytes, t.int) THEN
        Err (t, "illegal integer literal");
      END;
    ELSIF NOT TInt.New (SUBARRAY (t.buffer^, t.offset, break - t.offset), bytes, t.int)
       OR NOT TInt.ToInt (t.int, base)
       OR (base < 2) OR (16 < base) THEN
      Err (t, "illegal base for integer literal");
    ELSIF NOT TWord.New (SUBARRAY (t.buffer^, break+1,
                                   t.offset + t.length - break - 1),
                         base, bytes, t.int) THEN
      Err (t, "illegal based integer literal");
    END;
  END FixInt;

PROCEDURE FixFloat (t: T;  prec: Target.Precision) =
  BEGIN
    IF NOT TFloat.New (SUBARRAY (t.buffer^, t.offset, t.length),
                       prec, t.float) THEN
      Err (t, "illegal floating-point literal");
    END;
  END FixFloat;

PROCEDURE FixChar (t: T) =
  VAR ch: CHAR;
  BEGIN
    ch := t.buffer [t.offset+1];
    IF (t.length = 3) THEN (* ok *)
    ELSIF ConvertEscape (t, t.offset+1, t.length-2, ch) THEN (* ok *)
    ELSE Err (t, "illegal character literal");
    END;
    t.char := ORD (ch);
  END FixChar;

PROCEDURE FixText (t: T) =
  VAR
    len  := 0;
    cur  := t.offset + 1;        (* ignore the surrounding quotes *)
    stop := cur + t.length - 2;  (* ignore the surrounding quotes *)
    txt  := "";
    ch   : CHAR;
    buf  : ARRAY [0..255] OF CHAR;
  BEGIN
    WHILE (cur < stop) DO
      ch := t.buffer [cur];
      IF (ch = Backslash) THEN
        IF (cur+1 < stop) AND ConvertEscape (t, cur, 2, ch) THEN
          INC (cur);
        ELSIF (cur+3 < stop) AND ConvertEscape (t, cur, 4, ch) THEN
          INC (cur, 3);
        ELSE
          Err (t, "unrecognized escape sequence in text literal");
          RETURN;
        END;
      END;
      IF (len >= NUMBER (buf)) THEN
        txt := txt & Text.FromChars (buf);
        len := 0;
      END;
      buf [len] := ch;  INC (len);
      INC (cur);
    END;
    IF (len > 0) THEN
      txt := txt & Text.FromChars (SUBARRAY (buf, 0, len));
    END;
    t.text :=txt;
  END FixText;

PROCEDURE ConvertEscape (t: T;  offs, len: INTEGER;  VAR ch: CHAR): BOOLEAN =
  VAR c0, c1, c2: CHAR;
  BEGIN
    IF t.buffer[offs] # Backslash THEN RETURN FALSE; END;

    IF (len = 2) THEN
      c0 := t.buffer [offs+1];
      IF    (c0 = 'n')       THEN ch := '\n';       RETURN TRUE;
      ELSIF (c0 = 't')       THEN ch := '\t';       RETURN TRUE;
      ELSIF (c0 = 'r')       THEN ch := '\r';       RETURN TRUE;
      ELSIF (c0 = 'f')       THEN ch := '\f';       RETURN TRUE;
      ELSIF (c0 = '\'')      THEN ch := '\'';       RETURN TRUE;
      ELSIF (c0 = '\"')      THEN ch := '\"';       RETURN TRUE;
      ELSIF (c0 = Backslash) THEN ch := Backslash;  RETURN TRUE;
      END;

    ELSIF (len = 4) THEN
      c0 := t.buffer [offs+1];
      c1 := t.buffer [offs+2];
      c2 := t.buffer [offs+3];
      IF    ('0' <= c0) AND (c0 <= '7')
        AND ('0' <= c1) AND (c1 <= '7')
        AND ('0' <= c2) AND (c2 <= '7') THEN
        ch := VAL ( (ORD (c0) - ORD ('0')) * 64
                  + (ORD (c1) - ORD ('0')) * 8
                  + (ORD (c2) - ORD ('0')), CHAR);
        RETURN TRUE;
      END;
    END;

    RETURN FALSE;
  END ConvertEscape;

PROCEDURE FixPragma (t: T) =
  VAR
    p: Pragma;
    save_start   : CARDINAL;
    save_length  : CARDINAL;
    save_offset  : CARDINAL;
    save_line    : CARDINAL;
    save_column  : CARDINAL;
  BEGIN
    (* just in case we need to back up, save the current token *)
    <*ASSERT t.token = TK_Begin_pragma *>
    save_start  := t.start;
    save_length := t.length;
    save_offset := t.offset;
    save_line   := t.line;
    save_column := t.column;

    (* get the ID that defines the pragma *)
    SUPER.next (t);

    IF t.token # TK_Ident THEN
      (* oops, it's a badly formed pragma *)
      t.saved       := TRUE;
      t.next_token  := t.token;
      t.next_start  := t.start;
      t.next_length := t.length;
      t.next_offset := t.offset;
      t.next_line   := t.line;
      t.next_column := t.column;
      t.next_msg    := t.msg;

      t.token  := TK_Begin_pragma;
      t.start  := save_start;
      t.length := save_length;
      t.offset := save_offset;
      t.line   := save_line;
      t.column := save_column;
      t.msg    := NIL;
      RETURN;
    END;

    t.id := M3ID.FromStr (SUBARRAY (t.buffer^, t.offset, t.length));

    p := LookUpPragma (t.id);

    IF (p = NIL) AND Target.FindConvention (M3ID.ToText (t.id)) # NIL THEN
      (* it's a legit calling convention on the current target *)
      t.token := TK_CallConv;
      AddPragma (M3ID.ToText (t.id), TK_CallConv);

    ELSIF (p = NIL) THEN
      (* unknown pragma, restore the begin_pragma/id token sequence... *)
      t.saved       := TRUE;
      t.next_token  := t.token;
      t.next_start  := t.start;
      t.next_length := t.length;
      t.next_offset := t.offset;
      t.next_line   := t.line;
      t.next_column := t.column;
      t.next_msg    := t.msg;

      t.token  := TK_Begin_pragma;
      t.start  := save_start;
      t.length := save_length;
      t.offset := save_offset;
      t.line   := save_line;
      t.column := save_column;
      t.msg    := NIL;

    ELSIF (p.val > 0) THEN
      (* it's a recognized pragma to pass through *)
      t.token := p.val;

    ELSIF (p.val = IGNORE_PRAGMA) THEN
      EatPragma (t);

    ELSIF (p.val = LINE_PRAGMA) THEN
      EatPragma (t);  (* until we need something better... *)

    ELSIF (p.val = PRAGMA_PRAGMA) THEN
      SUPER.next (t);  (* PRAGMA *)
      WHILE (t.token = TK_Ident)
        OR ((M3Scanner.First_Keyword <= t.token)
            AND (t.token <= M3Scanner.Last_Keyword)) DO
        FixID (t);
        IF LookUpPragma (t.id) = NIL THEN
          AddPragma (M3ID.ToText (t.id), IGNORE_PRAGMA);
        END;
        SUPER.next (t);  (* Ident *)
        IF (t.token # TK_Comma) THEN EXIT END;
        SUPER.next (t); (* , *)
      END;
      IF (t.token # TK_End_pragma) THEN
        t.token := TK_Error;
        t.msg   := "missing '*>' on <*PRAGMA*> pragma";
      ELSE
        t.token := TK_Comment; (* fetch the next real token in the outer loop *)
      END;

    ELSE <*ASSERT FALSE*>
    END;

  END FixPragma;

PROCEDURE LookUpPragma (id: M3ID.T): Pragma =
  VAR p: Pragma;
  BEGIN
    IF (pragmas = NIL) THEN InitPragmas () END;
    p := pragmas;
    WHILE (p # NIL) AND (p.id # id) DO p := p.next; END;
    RETURN p;
  END LookUpPragma;

PROCEDURE EatPragma (t: T) =
  VAR key := t.id;  line := t.line;  offs := t.offset;  col := t.column;
  BEGIN
    LOOP
      IF (t.token = TK_End_pragma) THEN
        t.token := TK_Comment; (* so the top-level NextToken() cycles *)
        EXIT;
      ELSIF (t.token = TK_Error) THEN
        EXIT;
      ELSIF (t.token = TK_EOF) THEN
        t.token  := TK_Error;
        t.line   := line;
        t.offset := offs;
        t.column := col;
        t.msg := "<*" & M3ID.ToText(key) & "*> pragma is missing its closing '*>'";
        EXIT;
      ELSE
        SUPER.next (t);
      END;
    END;
  END EatPragma;

CONST
  IGNORE_PRAGMA = -1;
  PRAGMA_PRAGMA = -2;
  LINE_PRAGMA   = -3;

PROCEDURE InitPragmas () =
  BEGIN
    AddPragma ("TRACE",     TK_Trace);
    AddPragma ("OBSOLETE",  TK_Obsolete);
    AddPragma ("LINE",      LINE_PRAGMA);
    AddPragma ("PRAGMA",    PRAGMA_PRAGMA);
    AddPragma ("NOWARN",    IGNORE_PRAGMA);
    AddPragma ("INLINE",    TK_Inline);
    AddPragma ("UNUSED",    TK_Unused);
    AddPragma ("FATAL",     TK_Fatal);
    AddPragma ("EXTERNAL",  TK_External);
    AddPragma ("ASSERT",    TK_Assert);
  END InitPragmas;

PROCEDURE AddPragma (txt: TEXT;  val: INTEGER) =
  BEGIN
    pragmas := NEW (Pragma, id := M3ID.Add (txt), val := val, next := pragmas);
  END AddPragma;

PROCEDURE Err (t: T;  msg: TEXT) =
  BEGIN
    t.msg := msg & ": " & t.toText ();
    t.token := TK_Error;
  END Err;

BEGIN
END M3Lexer.

interface M3ID is in:


interface M3Scanner is in: