open Printf
open Html
open Dtd

(* Wrapped up lexer to insert open/close tags in the stream of "normal"
   tokens, according to some DTD, in order to always get fully parenthesized
   streams *)

type minimization =
  Legal | Illegal of string

let debug = ref false

exception CantMinimize			            (* bogus HTML *)

(* initial element of the DTD *)
let initial = Elements.add "html" Elements.empty

(* open minimize 
   [ominimize dtd open_tag current_stack]
   returns a list of inferred open/close tags and the new stack
 *)
let ominimize dtd t stack =
  let elem = t.tag_name in

  (* Is elem allowed for the given stack ? *)
  let goodpos = function
      [] -> Elements.mem elem initial
    | (_, cts)::l -> Elements.mem elem cts

  (* Return with inferred and stack.
     The stack has been reduced during the inference, so it is enough
     to push the opened element *)
  (* Special hack when t is fake #pcdata... *)
  and return inferred stack =
    if elem = "#pcdata" then
      List.rev inferred, stack
    else
      List.rev ((OpenTag t) :: inferred),
      (elem, Hashtbl.find dtd.contents elem) :: stack
      
  in
  (* [attempt_close mods_so_far current_stack] *)
  let rec attempt_close accu = function
     [] -> (* reached all the possible closing, attempt to open again *)
        attempt_open accu []
   | ((last, _)::l) as stack ->
       if Elements.mem last dtd.close_omitted then
          (* we can attempt to close the previous element *)
	  if goodpos l then 
	    (* good position, we're done *)
	    return ((CloseTag last) :: accu) l
          else (* attempt to open in this new position *)
	    try 
              attempt_open ((CloseTag last) :: accu) l
            with
	      CantMinimize -> (* try once more to close *)
	         attempt_close ((CloseTag last)::accu) l
       else begin (* since we can't close, try to open *)
	  attempt_open accu stack
       end

   (* [attempt_open mods_so_far currentstack] *)
   and attempt_open accu = function
     [] -> 
       (* open HTML, and retry from there *)
       (* should actually iterate on all elements in initial *)
       let newstack = ["html", Hashtbl.find dtd.contents "html"]
       and newaccu = (OpenTag {tag_name = "html"; attributes = []}) :: accu
       in
	  if goodpos newstack then return newaccu newstack
          else attempt_open newaccu newstack

   | ((_, cts)::l ) as stack ->
       (* check if, in contents, there is an element with implicit omission
          that would help *)
       let possible = Elements.inter cts dtd.open_omitted in
        match Elements.cardinal possible with
	  0 -> (* argh *) raise CantMinimize
        | 1 -> 
	  (* open this element and try from there *)
	  let newelem = Elements.choose possible in
	  let newaccu = (OpenTag {tag_name = newelem; attributes = []})::accu
          and newstack = (newelem, Hashtbl.find dtd.contents newelem)::stack
          in
	    if goodpos newstack 
	    then return newaccu newstack
	    else attempt_open newaccu newstack (* maybe more ? *)
        | n -> (* since we have the choice, examine all possibilities *)
	   let elems = Elements.elements possible in
	   let rec backtrack = function 
             [] -> raise CantMinimize
	    | x::l -> 
		try
		  let newaccu = (OpenTag {tag_name = x; attributes = []})::accu
		  and newstack = (x, Hashtbl.find dtd.contents x)::stack
                  in
		    if goodpos newstack then return newaccu newstack 
		    else attempt_open newaccu newstack
		with
		 CantMinimize -> backtrack l
           in 
	   backtrack elems
  in
   (* now do some error recovery *)   
   try Legal, attempt_close [] stack
   with
     CantMinimize ->
       (* what the hell, dammit, open it anyway, who cares, duh *)
       let current = match stack with (x,_)::l -> x | [] -> "" in
       Illegal (sprintf "illegal <%s> in <%s>, keep it though" 
		        t.tag_name current),
       return [] stack

(* close minimize
   [cminimize dtd elem current_stack]
   returns a list of inferred open/close tags and the new stack
 *)
let cminimize dtd tagname stack =
  (* Is elem allowed for the given stack ? *)
  let goodpos = function
      [] -> false
    | (elem, cts)::l -> tagname = elem

  and return inferred stack =
     List.rev ((CloseTag tagname) :: inferred), stack

  in
  (* [attempt_close mods_so_far current_stack] *)
  let rec attempt_close accu = function
     [] -> raise CantMinimize
   | ((last, _)::l) as stack ->
       if Elements.mem last dtd.close_omitted then
          (* we can attempt to close the previous element *)
	  if goodpos l then 
	    (* good position, we're done *)
	    return (CloseTag last :: accu) (List.tl l)
          else (* close a bit more ? *)
	    attempt_close ((CloseTag last)::accu) l
       else 
	 (* there's no reason we should have to open a new element in order
	    to close the current one, is it ? *)
          raise CantMinimize
  in
  (* error recovery strategy *)
  let rec attempt_matching accu = function
     [] -> raise Not_found (* didn't find a matching open at all ! *)
   | (curelem,_):: l when curelem = tagname ->
	 (* so, consider we match this open, and close them all *)
	 return accu l
   | (curelem,_):: l  -> (* otherwise, find something up there *)
	 attempt_matching (CloseTag curelem :: accu) l
   in
   (* now do some error recovery *)   
   try Legal, attempt_close [] stack
   with
     CantMinimize ->
       try
	 Illegal (sprintf "unmatched </%s>, close closest match" tagname),
         attempt_matching [] stack 
       with
	 Not_found -> 
	   Illegal (sprintf "unmatched </%s>, skipped" tagname),
           ([], stack) (* just skip the damn thing *)

let is_cdata cts =
     Elements.cardinal cts = 1 
  && Elements.mem "#cdata" cts

let sgml_lexer dtd =
  let current_lex = ref Lexhtml.html
  and stack = ref [] 
  and lexdata = Lexhtml.new_data ()
  in

  (* currently allowed elements *)
  let allowed () = match !stack with
      [] -> initial
    | (elem, cts)::_ -> cts 
  in
  (* whatever the situation (but close), if the previous element is empty
     with an omittable close, close it *)
  let close_empty () = match !stack with
      [] -> []
    | (elem, ctx)::l ->
	if Elements.is_empty ctx && Elements.mem elem dtd.close_omitted
	then (stack := l; [CloseTag elem])
	else []
  in  
  (fun lexbuf ->
     let token, loc = !current_lex lexbuf lexdata in
      if !debug then begin printf "got "; Html.print token end;
      begin match token with
        OpenTag t ->
          begin try (* first check that we know this element *)
	    let contents = Hashtbl.find dtd.contents t.tag_name in
            let extraclose = close_empty() in    
	    (* check changing of lexers; this works only if error recovery
	       rules imply that the tag will *always* be open
             *)
	    if is_cdata contents then current_lex := Lexhtml.cdata
	    else current_lex := Lexhtml.html;
	    (* is it allowed in here ? *)
	    if Elements.mem t.tag_name (allowed()) then begin
              (* push on the stack *)
              stack := (t.tag_name, contents) :: !stack;
	      Legal, extraclose @ [token], loc
              end
            else begin (* minimisation or error *)
              let flag, (res, l) = ominimize dtd t !stack in
		stack := l;
		flag, extraclose @ res, loc
              end
          with
	    Not_found -> 
	      (* Not in the DTD ! We return it, but don't change our state
		 or stack. An applet extension to the HTML display machine
                 can attempt to do something with it *)
	      Illegal (sprintf "Element %s not in DTD" t.tag_name),
	      [token],
	      loc
         end

      | CloseTag t ->
	  begin try (* do we know this element *)
            let _ = Hashtbl.find dtd.contents t in
	    match !stack with
	      [] -> 
		Illegal(sprintf "Unmatched closing </%s>" t),
		[],
		loc
           | (elem, cts)::l when elem = t -> (* matching close *)
	        stack := l; (* pop the stack *)
                (* the lexer has to be "normal" again, because CDATA
                   can't be nested anyway *)
      	        current_lex := Lexhtml.html;
		Legal, [token], loc
           | (elem, cts)::l -> (* unmatched close ! *)
      	        current_lex := Lexhtml.html;
		let flag, (res, l) = cminimize dtd t !stack in
		  stack := l;
		  flag, res, loc
          with
	    Not_found ->
	      Illegal (sprintf "Element %s not in DTD" t),
	      [token],
	      loc
          end

      | PCData s ->
          let extraclose = close_empty() in    
	  (* is it allowed in here ? *)
	  if Elements.mem "#pcdata" (allowed()) then
              Legal, extraclose @ [token], loc
          (* ignore PCData made of spaces if not relevant to the context *)
	  else if issp s then Legal, extraclose, loc
	  else
           begin	    
          (* bad hack. make believe that we try to open the #pcdata element *)
	    let flag, (res, l) = 
	      ominimize dtd {tag_name = "#pcdata"; attributes = []} !stack in
	      stack := l;
	      flag,  extraclose @ res @ [token], loc
	   end

      (* CData never happens with an empty stack *)
      | CData s ->
          let extraclose = close_empty() in    
	  if Elements.mem "#cdata" (allowed()) then
	    Legal, extraclose @ [token], loc
          else
	    Illegal(sprintf "Unexpected CDATA"),extraclose @ [token], loc
	    
      (* See if the stack is empty *)
      | EOF ->
	  begin match !stack with
	    [] -> Legal, [EOF], loc
          | l -> 
	      let flag, (res, l) = cminimize dtd "html" !stack in
		  stack := l;
		  flag, res @ [EOF], loc
          end

      | _ ->  Legal, [token], loc (* ignore all other cases *)
      end)


let automat dtd action lexbuf error =
  try
    let lexer = sgml_lexer dtd in
    while true do
      try 
      	let correct, tokens, loc = lexer lexbuf in
	begin match correct with
	  Legal -> ()
        | Illegal reason -> error loc reason
        end;
	List.iter 
	  (function token -> 
            begin 
	      try action loc token
              with Invalid_Html s -> error loc s
            end;
	    if token = EOF then failwith "quit_html_eval")
	   tokens
      with
        Html_Lexing (s,n) -> error (Loc(n,n+1)) s
    done
  with
    Failure "quit_html_eval" -> ()
