(* camlp4r *)

open Token;


(* Dictionaries: tree annotated with string options, each node is a map
   from chars to dictionaries (the subtrees). *)
module CharMap =
  Map.Make (struct type t = char; value compare = compare; end);
type ttree = {node : option string; branch : CharMap.t ttree};

value empty_ttree = {node = None; branch = CharMap.empty};
value ttree_add ttree str =
  let rec insert tt i =
    if i == String.length str then {node = Some str; branch = tt.branch}
    else
      let c = str.[i] in
      let br =
        match try Some (CharMap.find c tt.branch) with [ Not_found -> None ]
        with
        [ Some tt' ->
            CharMap.add c (insert tt' (i + 1)) (CharMap.remove c tt.branch)
        | None ->
            let tt' = {node = None; branch = CharMap.empty} in
            CharMap.add c (insert tt' (i + 1)) tt.branch ]
      in
      {node = tt.node; branch = br}
  in insert ttree 0
;
(* Search a string in a dictionary: raises Not_found
   if the word does not exist. *)
value ttree_find ttree str =
  let rec proc_rec tt i =
    if i == String.length str then
      match tt.node with
      [ Some s -> s
      | None -> raise Not_found ]
    else proc_rec (CharMap.find str.[i] tt.branch) (i+1)
  in proc_rec ttree 0
;



(* Lexer conventions on tokens *)
exception BadToken of string;

value check_special_token str =
  let rec loop =
    parser
    [ [: `' ' | '\n' | '\r' | '\t' :] -> raise (BadToken str)
    | [: _ = Stream.empty :] -> ()
    | [: `_ ; s :] -> loop s ]
  in
  loop (Stream.of_string str)
;
value check_ident str =
  let rec loop =
    parser
    [ [: `'$' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246'
              | '\248'..'\255' | '0'..'9' | ''' | '_'; s :] -> loop s
    | [: _ = Stream.empty :] -> ()
    | [: :] -> raise (BadToken str) ]
  in
  loop (Stream.of_string str)
;

(* Special token dictionary *)
value token_tree = ref empty_ttree;
value add_special_token str =
  do check_special_token str;
  return token_tree.val := ttree_add token_tree.val str
;

(* Keyword identifier dictionary *)
value keywords = ref empty_ttree;
value find_keyword s = ttree_find keywords.val s;
value add_keyword str =
  do check_ident str;
  return keywords.val := ttree_add keywords.val str
;


(* Adding a new token (keyword or special token). *)
value add_token (con, str) =
  match con with
  [ "" ->
      let normal_token =
        if String.length str > 0 then
          match str.[0] with
          [ ' ' | '\n' | '\r' | '\t' | '0'..'9' | '"' (*"*) ->
              raise (BadToken str)
          | '_' | '$' | 'a'..'z' | 'A'..'Z' -> True
          | _ -> False ]
        else True
      in
      if normal_token then add_keyword str else add_special_token str
  | "LIDENT" | "INT" | "STRING" | "EOI" -> ()
  | _ ->
      raise (Token.Error ("\
the constructor \"" ^ con ^ "\" is not recognized by Clexer")) ]
;



(* Freeze and unfreeze the state of the lexer *)
type frozen_t = (ttree * ttree);

value freeze () = (keywords.val, token_tree.val);
value unfreeze (kw,tt) =
  do keywords.val := kw;
     token_tree.val := tt;
  return ()
;
value init () =
  do unfreeze(empty_ttree, empty_ttree);
  return
    List.iter add_keyword
      ["Grammar"; "Syntax"; "Quit"; "Load"; "Compile";
       "of"; "with"; "end"; "as"; "in"; "using";
       "Cases"; "Fixpoint"; "CoFixpoint";
       "Definition"; "Inductive"; "CoInductive"; 
       "Theorem"; "Variable"; "Axiom"; "Parameter"; "Hypothesis";
       "Orelse";
       "Proof"; "Qed";
       "Prop"; "Set"; "Type"
      ] (* "let" is not a keyword because #Core#let.cci would not parse *)
;

init();



(* Errors occuring while lexing (explained as "Lexer error: ...") *)
value err loc str = Stdpp.raise_with_loc loc (Token.Error str);


(* The string buffering machinery *)

value buff = ref (String.create 80);
value store len x =
  do if len >= String.length buff.val then
       buff.val := buff.val ^ String.create (String.length buff.val)
     else ();
     buff.val.[len] := x;
  return succ len
;
value mstore len s =
  add_rec len 0 where rec add_rec len i =
    if i == String.length s then len else add_rec (store len s.[i]) (succ i)
;
value get_buff len = String.sub buff.val 0 len;



(* The classical lexer: idents, numbers, quoted strings, comments *)

value rec ident len =
  parser
  [ [: `('$' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' | '\216'..'\246' 
             |'\248'..'\255' | '0'..'9' | ''' | '_' | '@' as c); s :] ->
      ident (store len c) s
  | [: :] -> len ]
;

value rec number len =
  parser
  [ [: `('0'..'9' as c); s :] -> number (store len c) s
  | [: :] -> len ]
;

value escape len =
  fun
  [ c -> store len c ]
;

value rec string bp len =
  parser
  [ [: `'"' (*"*) :] -> len
  | [: `'\\'; `c; s :] -> string bp (escape len c) s
  | [: _ = Stream.empty :] ep -> err (bp, ep) "string not terminated"
  | [: `c; s :] -> string bp (store len c) s ]
;

value rec comment bp =
  parser
  [ [: `'(';
       _ =
        parser
        [ [: `'*'; _ = comment bp :] -> ()
        | [: :] -> () ]; s :] -> comment bp s
  | [: `'*';
       _ =
        parser
        [ [: `')' :] -> ()
        | [: s :] -> comment bp s ] :] -> ()
  | [: `'"' (*"*); _ = parser bp [: _ = string bp 0 :] -> (); s :] ->
             comment bp s
  | [: _ = Stream.empty :] ep -> err (bp, ep) "comment not terminated"
  | [: `_; s :] -> comment bp s ]
;

(* Parse a special token, using the token_tree *)
value process_chars bp c cs =
  let rec proc_rec tt =
    match
      match Stream.peek cs with
      [ Some c -> try Some (CharMap.find c tt.branch)
                  with [ Not_found -> None ]
      | None -> None ]
    with
    [ Some tt' -> do Stream.junk cs; return proc_rec tt'
    | None -> tt.node ]
  in
  let t =
    try proc_rec (CharMap.find c token_tree.val.branch) with
    [ Not_found -> token_tree.val.node ]
  in
  let ep = Stream.count cs in
  match t with
  [ Some t -> (("", t), (bp, ep))
  | None -> err (bp, ep) "undefined token" ]
;

(* Parse a token in a char stream *)
value rec next_token =
  parser bp
  [ [: `' ' | '\n' | '\r'| '\t'; s :] -> next_token s
  | [: `('$' | '_' | 'a'..'z' | 'A'..'Z' | '\192'..'\214' 
             | '\216'..'\246' | '\248'..'\255' as c);
       len = ident (store 0 c) :] ep ->
      let id = get_buff len in
      (try ("", find_keyword id) with [ Not_found -> ("LIDENT", id) ],
       (bp, ep))
  | [: `('0'..'9' as c); len = number (store 0 c) :] ep ->
         (("INT", get_buff len), (bp, ep))
  | [: `'"' (*"*); len = string bp 0 :] ep ->
              (("STRING", get_buff len), (bp, ep))
  | [: `('(' as c);
        t =
          parser
          [ [: `'*'; _ = comment bp; s :] -> next_token s
          | [: t = process_chars bp c :] -> t ] :] -> t
  | [: `c; t = process_chars bp c :] -> t
  | [: _ = Stream.empty :] -> (("EOI", ""), (bp, bp + 1)) ]
;


(* Location table system for creating tables associating a token count
   to its location in a char stream (the source) *)
value locerr () = invalid_arg "Lexer: location function";
value loct_create () = ref (Array.create 1024 None);
value loct_func loct i =
  match
    if i < 0 || i >= Array.length loct.val then None
    else Array.unsafe_get loct.val i
  with
  [ Some loc -> loc
  | _ -> locerr () ]
;
value loct_add loct i loc =
  do if i >= Array.length loct.val then
       let new_tmax = Array.length loct.val * 2 in
       let new_loct = Array.create new_tmax None in
       do Array.blit loct.val 0 new_loct 0 (Array.length loct.val);
          loct.val := new_loct;
       return ()
     else ();
     loct.val.(i) := Some loc;
  return ()
;

value func cs =
  let loct = loct_create () in
  let ts =
    Stream.from
      (fun i ->
         let (tok, loc) = next_token cs in
         do loct_add loct i loc; return Some tok)
  in
  (ts, loct_func loct)
;

(* Names of tokens, for this lexer, used in Grammar error messages *)
value token_text =
  fun
  [ ("", t) -> "'" ^ t ^ "'"
  | ("LIDENT", "") -> "lowercase identifier"
  | ("LIDENT", t) -> "'" ^ t ^ "'"
  | ("INT", "") -> "integer"
  | ("INT", s) -> "'" ^ s ^ "'"
  | ("STRING", "") -> "string"
  | ("EOI", "") -> "end of input"
  | (con, "") -> con
  | (con, prm) -> con ^ " \"" ^ prm ^ "\"" ]
;

value tparse (p_con, p_prm) =
  if p_prm = "" then
    parser [: `(con, prm) when con = p_con :] -> prm
  else
    parser [: `(con, prm) when con = p_con && prm = p_prm :] -> prm
;

(* $Id: clexer.ml4,v 1.14 1999/06/29 07:47:29 loiseleu Exp $ *)
