(* $Id$ -*- tuareg -*-
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)

open Parsing
open Pxp_types
open Pxp_lexer_types
open Pxp_dtd
open Pxp_entity
open Pxp_document
open Pxp_aux

(* Some types from the interface definition: *)

(* Raised by an [index] implementation when the same ID value is
 * registered twice (the contract lives in the interface file;
 * not used directly in this chunk).
 *)
exception ID_not_unique

(* An [index] maps ID attribute values (strings) to the nodes bearing
 * them. [add] registers a node under an ID; [find] looks an ID up.
 *)
class type [ 'ext ] index =
object
  constraint 'ext = 'ext node #extension
  method add : string -> 'ext node -> unit
  method find : string -> 'ext node
end

(* Global parser configuration. The flags are read by the parser object
 * below (e.g. [encoding] selects the lexer set, and
 * [enable_super_root_node] controls what the initializer pushes onto
 * the element stack).
 *)
type config =
    { warner : collect_warnings;
        (* Object receiving warning messages *)
      errors_with_line_numbers : bool;
        (* Whether line numbers are tracked for error reporting *)
      enable_pinstr_nodes : bool;
        (* Whether nodes are created for processing instructions *)
      enable_super_root_node : bool;
        (* Whether a "super root" node is created above the root element *)
      enable_comment_nodes : bool;
        (* Whether nodes are created for comments *)
      encoding : rep_encoding;
        (* The internal representation encoding *)
      recognize_standalone_declaration : bool;
        (* Whether a standalone="..." declaration is honoured *)
      store_element_positions : bool;
        (* Whether source positions of elements are remembered *)
      idref_pass : bool;
        (* Whether IDREF attributes are checked (extra pass) *)
      validate_by_dfa : bool;
        (* Whether content models are validated with DFAs *)
      accept_only_deterministic_models : bool;
        (* Whether non-deterministic content models are rejected *)
      debugging_mode : bool;
        (* Whether debugging output is enabled *)
    }

(* Where the XML text comes from: either a ready-made entity factory
 * plus resolver, or an external ID to be resolved by the given
 * resolver.
 *)
type source =
    Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
  | ExtID of (ext_id * Pxp_reader.resolver)

(* Grammar start symbol: selects what the [parse] method accepts. *)
type start_symbol =
    Ext_document        (* a complete XML document *)
  | Ext_declarations    (* an external DTD subset *)
  | Ext_element         (* a single element *)

(* The token-stream interface between the entity manager and the
 * generated parser.
 *)
type context =
    { mutable current : unit -> token;
        (* get the current token *)
      mutable get_next : unit -> token;
        (* go on to the next token; return it *)
      mutable current_token : token;
        (* This is the current token *)
      mutable manager : entity_manager;
        (* The entity manager *)
    }

(* [make_context entity_manager] creates a fresh [context] wired to
 * [entity_manager], and already advances it to the first token.
 *)
let make_context entity_manager =
  let c =
    { current = (fun _ -> assert false);   (* dummy; replaced below *)
      get_next = (fun _ -> assert false);  (* dummy; replaced below *)
      current_token = Eof;
      manager = entity_manager;
    }
  in
  (* Note that the function which is stored in get_next_ref can be changed
   * as a side-effect when an entity is opened or closed. The function in
   * c.get_next must be programmed such that always the current "get_next"
   * function is executed.
*)
  let get_next_ref = entity_manager # yy_get_next_ref in
  c.current <- (fun () -> c.current_token);
  c.get_next <- (fun () ->
                   (* Dereference on every call: the entity manager may
                    * replace the function behind [get_next_ref]. *)
                   let tok = !get_next_ref() in
                   c.current_token <- tok;
                   tok);
  (* Load the first token so that c.current_token is valid: *)
  ignore(c.get_next());
  c
;;

(* [from_channel ?system_encoding ?id ?fixenc ch] builds a [source]
 * reading the XML text from the already open channel [ch]. The channel
 * can only be read once; any further attempt to open the same source
 * is rejected (see [channel_of_url] below).
 *)
let from_channel ?system_encoding ?id:init_id ?fixenc ch =
  (* Reading from a channel works by modifying the algorithm of
   * resolve_as_file.
   *)
  let url_syntax =
    (* A syntax suitable for "file" URLs *)
    { Neturl.null_url_syntax with
        Neturl.url_enable_scheme = Neturl.Url_part_allowed;
        Neturl.url_enable_host = Neturl.Url_part_allowed;
        Neturl.url_enable_path = Neturl.Url_part_required;
        Neturl.url_accepts_8bits = true;
    }
  in
  let an_url =
    (* The URL file:/// - returned when no ID is associated with [ch] *)
    Neturl.make_url
      ~scheme: "file"
      ~host: ""
      ~path: [ "" ]
      url_syntax
  in
  let init_channel_done = ref false in
    (* Whether the first access to this source has already happened. *)
  (* The task of url_of_id is:
   * - When it is called the first time, and no init_id is present,
   *   the URL file:/// is passed back (an_url). This forces that
   *   absolute path names /path/dir/... will be interpreted as
   *   file path names. (But relative path names will not work.)
   * - If an init_id has been passed, we can assume that the opened URL
   *   is exactly this init_id. By raising Not_competent it is indicated
   *   that the standard method is to be used for the interpretation of
   *   the URL.
   * - Otherwise, the channel is already being read, and thus cannot again
   *   opened. (This case is handled in channel_of_url.)
   *)
  let url_of_id xid =
    if !init_channel_done then begin
      (* Use the normal way of determining the URL of the ID: *)
      raise Pxp_reader.Not_competent
    end
    else begin
      match init_id with
          None ->
            an_url
            (* If the channel is not associated with any URL: Simply pass
             * the URL file:/// back.
             *)
        | Some the_init_id ->
            assert (the_init_id = xid);
            raise Pxp_reader.Not_competent
            (* If the channel is associated with a URL, the corresponding
             * ID must be passed when the first invocation happens.
*)
    end
  in
  (* The task of channel_of_url:
   * - If it is called the first time ("else"), the channel is returned
   * - Otherwise, the channel is already being read, and thus cannot again
   *   opened. By raising Not_competent it is signaled that the
   *   resolve_as_file object must not continue to open the URL.
   *)
  let channel_of_url url =
    if !init_channel_done then
      raise Pxp_reader.Not_competent
    else begin
      init_channel_done := true;
      ch, fixenc
    end
  in
  let r =
    new Pxp_reader.resolve_as_file
      ?system_encoding:system_encoding
      ~url_of_id:url_of_id
      ~channel_of_url:channel_of_url
      ()
  in
  let init_xid =
    match init_id with
        None -> Anonymous
      | Some id ->
          (* Note: 'id' may be illegal (malformed); in this case, the first
           * invocation of url_of_id will raise Not_competent, and the 'open_in'
           * method will fail.
           *)
          id
  in
  ExtID(init_xid, r)
;;

(* [from_file ?system_encoding utf8_filename] builds a [source] for the
 * file with the given UTF-8 encoded name. A relative name is made
 * absolute by prepending the current working directory; the result is
 * represented as a file://localhost/... URL.
 *)
let from_file ?system_encoding utf8_filename =
  let r =
    new Pxp_reader.resolve_as_file
      ?system_encoding:system_encoding
      ()
  in
  let utf8_abs_filename =
    if utf8_filename <> "" && utf8_filename.[0] = '/' then
      utf8_filename
    else
      Sys.getcwd() ^ "/" ^ utf8_filename
  in
  let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in
  let url =
    Neturl.make_url
      ~scheme:"file"
      ~host:"localhost"
      ~path:(Neturl.split_path utf8_abs_filename)
      syntax
  in
  let xid = System (Neturl.string_of_url url) in
  ExtID(xid, r)
;;

(* [from_string ?fixenc s] builds a [source] reading the XML text
 * directly from the string [s]; [fixenc] optionally forces the
 * character encoding of [s].
 *)
let from_string ?fixenc s =
  let r = new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in
  ExtID(Anonymous, r)
;;

(**********************************************************************)

(* The parser object. The grammar rules below (after the %% marker) are
 * transformed by m2parsergen into the body of the method [parse]
 * (see the comment preceding that method).
 *)
class ['ext] parser_object
        init_doc init_dtd init_extend_dtd init_config init_resolver
        init_spec init_process_xmldecl transform_dtd id_index =
  object (self)
    (* Note that the 'ext parameter has been the motivation to make the
     * parser a class.
     *)

    val mutable dtd = init_dtd
      (* The DTD being parsed; or the DTD currently assumed *)

    val extend_dtd = init_extend_dtd
      (* Whether the DTD should be extended by ELEMENT, ATTLIST, and
       * NOTATION declarations or not.
(True for validating mode, false for well-formedness mode.)
       *)

    val transform_dtd = transform_dtd
      (* A function transforming the DTD *)

    val id_index = (id_index : 'ext index option)
      (* The ID index or None *)

    val process_xmldecl = init_process_xmldecl
      (* Whether the XML declaration is parsed and the found XML version
       * and standalone declaration are passed to 'doc'.
       *)

    val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding)
      (* The set of lexer functions matching the configured encoding *)

    val doc = init_doc
      (* The current document *)

    method doc =
      (doc : 'ext document)

    val resolver = init_resolver
      (* The resolver for external IDs *)

    val config = init_config
      (* The current configuration *)

    val elstack = (Stack.create() : ('ext node * entity_id) Stack.t)
      (* The element stack containing all open elements, i.e. elements that
       * have begun by a start tag but that have not been finished (end tag).
       * If the parser sees a start tag, it creates the element and pushes it
       * on top of this stack. If the parser recognizes an end tag, it pulls
       * one element from the stack and checks if it has the same name as
       * given with the end tag.
       *
       * At initialization time, a special element is pushed on the stack,
       * the so-called super root. It is always the bottommost
       * element of the stack, and serves as a guard.
       * [See "initializer" below.]
       *)

    method current =
      (* Get the top element of the element stack *)
      try
        fst(Stack.top elstack)
      with
          Stack.Empty -> assert false
            (* Not possible, because the super root is always the element
             * at the bottom of the stack.
             *)

    val mutable n_tags_open = 0
      (* Number of begin tags that have been parsed and whose corresponding
       * end tags have not yet been parsed *)

    val mutable p_internal_subset = false
      (* true while parsing the internal subset - there are some additional
       * constraints for internal subsets, and because of this it must
       * be known whether the current declaration is contained in the
       * internal or external subset of the DTD.
*)

    val mutable root = None
      (* Contains the root element (topmost element) while it is being parsed
       * and after it has been parsed.
       * This variable is None before the root element is seen.
       *)

    method root = root

    val spec = init_spec
      (* A hashtable that contains exemplar objects for the various element
       * types. If an element is parsed, the exemplar is looked up and
       * "cloned" (by the "create" method)
       *)

    val mutable current_data = []
      (* Collects character data. New fragments are consed to the front,
       * so the list holds the fragments in reverse order of arrival.
       *)

    method collect_data s =
      (* Collects the character material 's' *)
      current_data <- s :: current_data

    method save_data =
      (* Puts the material collected in 'current_data' into a new
       * node, and appends this node as new sub node to 'current'
       *)
      match current_data with
          [] ->
            ()
        | [ str ] ->
            (* Common case: a single fragment, no concatenation needed *)
            if str <> "" then
              self # current # add_node (create_data_node spec dtd str);
            current_data <- []
        | _ ->
            (* Several fragments: concatenate them into one string.
             * Because the list is in reverse order, the buffer is filled
             * from right to left.
             * NOTE(review): String.create / in-place String.blit are the
             * pre-"bytes" mutable-string API; this file targets a legacy
             * OCaml version.
             *)
            let count =
              List.fold_left
                (fun acc s -> acc + String.length s)
                0
                current_data
            in
            let str = String.create count in
            let pos = ref count in
            List.iter
              (fun s ->
                 let l = String.length s in
                 pos := !pos - l;
                 String.blit ~src:s ~src_pos:0 ~dst:str ~dst_pos:(!pos) ~len:l
              )
              current_data;
            assert(!pos = 0);
            if str <> "" then
              self # current # add_node (create_data_node spec dtd str);
            current_data <- []

    method only_whitespace data =
      (* Checks that the string "data" contains only whitespace, by
       * scanning it with the lexer: it must yield exactly one Ignore
       * token followed by Eof.
       * NOTE(review): the original comment said Validation_error is
       * raised on failure, but the code actually raises WF_error -
       * the comment was probably outdated.
       *)
      let lexbuf = Lexing.from_string data in
      let t1 = lexerset.scan_name_string lexbuf in
      if t1 <> Ignore then raise(WF_error("Data not allowed here"));
      let t2 = lexerset.scan_name_string lexbuf in
      if t2 <> Eof then raise(WF_error("Data not allowed here"));
      ()

    initializer
      (* CHECKS: *)
      (* The DTD must use the same internal encoding as configured: *)
      if config.encoding <> dtd # encoding then
        failwith("Encoding mismatch");

      (* --- Initialize 'elstack': Push the super-root on the stack.
*) let super_root = if config.enable_super_root_node then create_super_root_node spec dtd else (* because spec may not contain an exemplar for the super root: *) create_no_node spec dtd in (* Move the super root or the emulation to the stack: *) Stack.push (super_root, (self :> entity_id)) elstack; (********* Here the method "parse" begins. The grammar below is * transformed to a local function of this method *) method parse context start_symbol = let parse_ignored_section yy_current yy_get_next = (* A special parser which should be used after is found. *) while yy_current() = Ignore do ignore(yy_get_next()); done; ( match yy_current() with Conditional_body _ -> () | _ -> raise Parsing.Parse_error; ); let en = context.manager # current_entity in let llev = ref 1 in while !llev >= 1 do let igntok = en # next_ignored_token in (* next_ignored_token: uses a special lexer that only * recognizes Conditional_begin and Conditional_end; * other character combinations are ignored. *) (* NOTE: next_ignored_token works much like yy_get_next, * but it does not set the current token! 
*) match igntok with Conditional_begin _ -> llev := !llev + 1 | Conditional_end _ -> llev := !llev - 1; (* Because the loop may be exited now: *) context.current_token <- igntok; | (End_entity | Eof) -> raise Parsing.Parse_error | _ -> () done; in let check_and_parse_xmldecl xmldecl = if process_xmldecl then begin let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in check_version_num v; doc # init_xml_version v; let v = match s with None -> false | Some "yes" -> true | Some "no" -> false | _ -> raise (WF_error("Illegal 'standalone' declaration")) in if config.recognize_standalone_declaration then dtd # set_standalone_declaration v end in let recode_utf8 s = (* Recode 's' to UTF-8 *) if config.encoding = `Enc_utf8 then s (* No recoding necessary *) else Netconversion.recode_string ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s in %% /* The following grammar looks similar to ocamlyacc grammars, but * ocamlyacc is actually not used to transform the grammar into a parser. * Instead, the parser generator m2parsergen is applied. * * The format of the grammar is different (see m2parsergen/README), * but I hope that you can understand most features immediately. * * The type of the parser is different: m2parsergen creates a top-down * parser while ocamlyacc generates a LALR-1 parser. * * The way the generated code is called is different: ocamlyacc produces * lots of top-level definitions whereas m2parsergen generates only * a local let-in-phrase. This is explained in the already mentioned * README file. 
*/ /* See Pxp_types.ml for comments to the various tokens */ %token Begin_entity %token End_entity %token Comment_begin %token Comment_end %token Ignore %token Eq %token Rangle %token Rangle_empty %token <> Conditional_begin %token <> Conditional_body %token <> Conditional_end %token Percent %token Plus %token Star %token Bar %token Comma %token Qmark %token Pcdata %token Required %token Implied %token Fixed %token Eof %token <> Comment_material %token <> Doctype %token <> Doctype_rangle %token <> Dtd_begin %token <> Dtd_end %token <> Decl_element %token <> Decl_attlist %token <> Decl_entity %token <> Decl_notation %token <> Decl_rangle %token <> Lparen %token <> Rparen %token <> RparenPlus %token <> RparenStar %token <> RparenQmark %token <> Tag_beg %token <> Tag_end %token <> PI %token <> PI_xml %token <> Cdata %token <> CRef %token <> ERef %token <> PERef %token <> CharData %token <> LineEnd %token <> Name %token <> Nametoken %token <> Attval %token <> Attval_nl_normalized %token <> Unparsed_string /* START SYMBOLS: * * "ext_document": parses a complete XML document (i.e. containing a * and an element) * "ext_declarations": parses an "external DTD subset", i.e. a sequence * of declarations * "ext_element": parses a single element (no allowed); * the element needs not to be the root element of the * DTD * * The functions corresponding to these symbols return always () because * they only have side-effects. */ /* SOME GENERAL COMMENTS: * * The parser does not get its tokens from the lexers directly. Instead of * this, there is an entity object between the parser and the lexers. This * object already handles: * * - References to general and parameter entities. The token stream is * modified such that tokens automatically come from the referenced entities. * External parameter entities and all general entities are embraced by * the two special tokens Begin_entity and End_entity. The parser must * check that these braces are correctly nested. 
*/ %% ext_document(): Begin_entity doc_xmldecl_then_misc_then_prolog_then_rest() End_entity {{ if n_tags_open <> 0 then raise(WF_error("Missing end tag")) }} /* In the following rule, we must find out whether there is an XML declaration * or not, and directly after that either "process_xmldecl" or * "process_missing_xmldecl" of the current entity must be called. * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out * in the "$" clause immediately following the first token. * * TODO: This is not enough. The first token may be a tag, and the tag * may already contain non-ASCII characters. (But in this case, the resolvers * assume UTF8, and they are right...) */ doc_xmldecl_then_misc_then_prolog_then_rest(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; check_and_parse_xmldecl pl; }} misc()* doc_prolog_then_rest() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} misc() misc()* doc_prolog_then_rest() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} doctypedecl() misc()* contents_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} contents_start() {{ () }} doc_prolog_then_rest(): doctypedecl() misc()* contents_start() {{ () }} | contents_start() {{ () }} ext_element(): Begin_entity el_xmldecl_then_misc_then_rest() End_entity {{ if n_tags_open <> 0 then raise(WF_error("Missing end tag")) }} /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ el_xmldecl_then_misc_then_rest(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} misc()* contents_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} misc() misc()* contents_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} contents_start() {{ () }} ext_declarations(): /* Parses a sequence of declarations given by an entity. 
As side-effect, * the parsed declarations are put into the dtd object. */ Begin_entity decl_xmldecl_then_rest() {{ () }} | Eof {{ () }} decl_xmldecl_then_rest(): /* Note: This rule is also called from declaration()! */ pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} declaration()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} declaration() declaration()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} End_entity {{ () }} misc(): pi() {{ () }} | data: CharData /* In this context, the lexers sometimes do not recognize white space; * instead CharData tokens containing white space are delivered. */ {{ self # only_whitespace data }} | Ignore {{ () }} | comment() {{ () }} /********************* DOCUMENT TYPE DECLARATION *************************/ doctypedecl(): /* parses from . As side-effect, first the declarations of * the internal DTD (if any) are put into !!on_dtd, then the declarations * of the external DTD (if any) are put into this DTD object. */ doctype_entid: Doctype ws: Ignore Ignore* doctypedecl_material (doctype_entid) {{ () }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'")) | _ -> raise(WF_error("Bad DOCTYPE declaration")) }} /* TRICK: * ws: Ignore? Ignore* * is meant seriously. The effect is that ws becomes a boolean variable * which is true if there is an Ignore token and false otherwise. * This construct is faster than just * ws: Ignore* * in which case ws becomes an integer variable containing the number of * Ignore tokens. Counting the number of tokens is slower than only checking * the existence. * * We need the information whether there is an Ignore token (representing * white space), because white space is only obligatory if also an identifier * for the external subset is parsed; this conditional syntax constraint is * simply programmed in the body of the grammar rule. 
*/ doctypedecl_material(doctype_entid): root_name: Name ws: Ignore? Ignore* external_subset: external_id()? Ignore* internal_subset: internal_dtd()? Ignore* doctype_rangle_entid: Doctype_rangle {{ if doctype_entid != doctype_rangle_entid then raise (Validation_error("Entities not properly nested with DOCTYPE declaration")); dtd # set_root root_name; begin match external_subset, internal_subset with None, None -> () (* no DTD means no ID *) | None, Some _ -> dtd # set_id Internal | Some id, None -> dtd # set_id (External id) | Some id, Some _ -> dtd # set_id (Derived id) end; (* Get now the external doctype declaration. Note that the internal * subset has precedence and must be read first. *) begin match external_subset with None -> () | Some id -> if not ws then raise(WF_error("Whitespace is missing after `DOCTYPE " ^ root_name ^ "'")); let r' = resolver # clone in let pobj = new parser_object (new document config.warner) dtd extend_dtd config r' spec process_xmldecl (fun x -> x) None in let en = new external_entity r' dtd "[dtd]" config.warner id false config.errors_with_line_numbers config.encoding in en # set_debugging_mode (config.debugging_mode); let mgr = new entity_manager en in en # open_entity true Declaration; try let context = make_context mgr in pobj # parse context Ext_declarations; ignore(en # close_entity); with error -> ignore(en # close_entity); r' # close_all; let pos = mgr # position_string in raise (At(pos, error)) end; dtd # validate }} ? {{ match !yy_position with "doctype_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad DOCTYPE declaration")) }} /* Note that there are no keywords for SYSTEM or PUBLIC, as these would * be difficult to recognize in the lexical contexts. Because of this, * SYSTEM/PUBLIC is parsed as name, and the rule for everything after * SYSTEM/PUBLIC is computed dynamically. 
*/ external_id(): tok:Name $ {{ let followup = match tok with "SYSTEM" -> parse_system_id (* Apply the rule system_id (below) to parse the * rest of the ID *) | "PUBLIC" -> parse_public_id (* Apply the rule public_id (below) to parse the * rest of the ID *) | _ -> raise(WF_error("SYSTEM or PUBLIC expected")) in }} ws:Ignore Ignore* r:[followup]() {{ r }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after " ^ tok)) | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier")) }} system_id(): str:Unparsed_string {{ System (recode_utf8 str) }} public_id(): str1: Unparsed_string ws: Ignore Ignore* str2: Unparsed_string {{ check_public_id str1; Public(recode_utf8 str1, recode_utf8 str2) }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier")) | _ -> raise(WF_error("Bad PUBLIC identifier")) }} /* The internal subset: "[" declaration* "]". While parsing the declarations * the object variable p_internal_subset must be true; however, if there * are entity references, this variable must be reset to false during * the entity. (See the rule for "declaration" below.) */ internal_dtd(): dtd_begin_entid: internal_dtd_begin() declaration()* dtd_end_entid: internal_dtd_end() {{ if dtd_begin_entid != dtd_end_entid then raise(Validation_error("Entities not properly nested with internal DTD subset")) }} ? {{ match !yy_position with "dtd_end_entid" -> raise(WF_error("`]' expected")) | _ -> raise(WF_error("Bad internal DTD subset")) }} internal_dtd_begin(): Dtd_begin {{ assert (not p_internal_subset); p_internal_subset <- true }} internal_dtd_end(): Dtd_end {{ assert p_internal_subset; p_internal_subset <- false }} declaration(): /* Parses a single declaration (or processing instruction). As side-effect * the parsed declaration is stored into the dtd object. */ elementdecl() {{ () }} | attlistdecl() {{ () }} | entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid) {{ () }} ? 
{{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after ENTITY")) | "e" -> raise(WF_error("Name or `%' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} | notationdecl() {{ () }} | pi: PI {{ let target, value = pi in let pi = new proc_instruction target value config.encoding in dtd # add_pinstr pi }} | Ignore {{ () }} | Comment_begin Comment_material* ce:Comment_end {{ () }} ? {{ match !yy_position with "ce" -> raise(WF_error("`-->' expected")) | _ -> raise(WF_error("Bad comment")) }} | Begin_entity $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity' * rule is parsed. This allows unrestricted usage of parameter entities * within declarations of internal entities. *) let old_p_internal_subset = p_internal_subset in p_internal_subset <- false; }} decl_xmldecl_then_rest() {{ (* Restore the old value of 'p_internal_subset'. *) p_internal_subset <- old_p_internal_subset; () }} | begin_entid:Conditional_begin $ {{ (* Check whether conditional sections are allowed at this position. *) if p_internal_subset then raise(WF_error("Restriction of the internal subset: Conditional sections not allowed")); }} Ignore* cond:conditional_section() end_entid:Conditional_end {{ (* Check whether Conditional_begin and Conditional_end are in the same * entity. (This restriction is explained in the file SPECS.) *) if begin_entid != end_entid then raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)")); }} ? {{ match !yy_position with "end_entid" -> raise(WF_error("`>]>' expected")) | "cond" -> raise(WF_error("INCLUDE or IGNORE expected")) | _ -> raise(WF_error("Bad conditional section")) }} /* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the * right parsing rule is dynamic. * Note that parse_ignored_section is not defined by a grammar rule but * by a conventional let-binding above. 
*/ conditional_section(): include_or_ignore:Name $ {{ let parsing_function = match include_or_ignore with "INCLUDE" -> parse_included_section (* invoke rule "included_section" below *) | "IGNORE" -> parse_ignored_section (* invoke function "parse_ignored_section" *) | _ -> raise(WF_error("INCLUDE or IGNORE expected")) in }} [ parsing_function ] () {{ () }} ? {{ raise(WF_error("Bad conditional section")) }} included_section(): Conditional_body declaration()* {{ () }} | Ignore Ignore* Conditional_body declaration()* {{ () }} /*************************** ELEMENT DECLARATIONS ********************/ elementdecl(): /* parses . Puts the parsed element type as side-effect into * dtd. */ decl_element_entid: Decl_element $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* content_model: contentspec() Ignore* decl_rangle_entid: Decl_rangle {{ if decl_element_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ELEMENT declaration"); if extend_dtd then begin let el = new dtd_element dtd name in (* It is allowed that an precedes the corresponding * . Because of this it is possible that there is already * an element called 'name' in the DTD, and we only must set the content * model of this element. *) try dtd # add_element el; el # set_cm_and_extdecl content_model extdecl; with Not_found -> (* means: there is already an element 'name' *) let el' = dtd # element name in el' # set_cm_and_extdecl content_model extdecl; (* raises Validation_error if el' already has a content model *) end }} ? 
{{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "name" -> raise(WF_error("The name of the element is expected here")) | "content_model" -> raise(WF_error("Content model expression expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad element type declaration")) }} contentspec(): /* parses a content model and returns it (type content_model_type) */ name: Name /* EMPTY or ANY */ {{ match name with "EMPTY" -> Empty | "ANY" -> Any | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected")) }} | entid:Lparen Ignore* term:mixed_or_regexp(entid) {{ term }} ? {{ raise(WF_error("Bad content model expression")) }} /* Many of the following rules have an lparen_entid argument. This is the * internal ID of the entity containing the corresponding left parenthesis; * by comparing it with the ID of the entity of the right parenthesis the * contraint is implemented that both parentheses must be in the same entity. */ mixed_or_regexp(lparen_entid): re: choice_or_seq(lparen_entid) {{ Regexp re }} | m: mixed(lparen_entid) {{ m }} multiplier(): /* returns one of the multiplier symbols (?,*,+) */ Plus {{ Plus }} | Star {{ Star }} | Qmark {{ Qmark }} mixed (lparen_entid) : Pcdata Ignore* material: mixed_alternatives_top() {{ let rest, rparen_entid = material in if lparen_entid != rparen_entid then raise (Validation_error "Entities not properly nested with parentheses"); Mixed (MPCDATA :: rest) }} ? {{ raise(WF_error("Bad content model expression")) }} mixed_alternatives_top(): entid: Rparen {{ [], entid }} | entid: RparenStar {{ [], entid }} | Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar {{ (MChild name :: names), entid }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | "entid" -> raise(WF_error("`)*' expected")) | _ -> raise(WF_error("Bad content model expression")) }} mixed_alternative() : Bar Ignore* name:Name Ignore* {{ MChild name }} ? 
{{ match !yy_position with "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad content model expression")) }} choice_or_seq (lparen_entid): /* parses either a regular expression, or a mixed expression. Returns * Mixed spec or Regexp spec (content_model_type). * Which kind of expression (regexp or mixed) is being read is recognized * after the first subexpression has been parsed; the other subexpressions * must be of the same kind. */ re: cp() Ignore* factor: choice_or_seq_factor() {{ let (finalmark,subexpr), rparen_entid = factor in if lparen_entid != rparen_entid then raise (Validation_error "Entities not properly nested with parentheses"); (* Check that the other subexpressions are "regexp", too, and * merge them with the first. *) let re' = match subexpr with Alt [] -> re | Alt alt -> Alt (re :: alt) | Seq seq -> Seq (re :: seq) | _ -> assert false in (* Interpret the finalmark. *) match finalmark with Ignore -> re' | Plus -> Repeated1 re' | Star -> Repeated re' | Qmark -> Optional re' | _ -> assert false }} ? {{ raise(WF_error("Bad content model expression")) }} choice_or_seq_factor(): /* Parses "||...)" or ",,...)", both forms optionally * followed by ?, *, or +. 
* Returns ((finalmark, expr), rparen_entid), where * - finalmark is the character after the right parenthesis or Ignore * - expr is either * Alt [] meaning that only ")" has been found * Alt non_empty_list meaning that the subexpressions are separated by '|' * Seq non_empty_list meaning that the subexpressions are separated by ',' */ entid:Rparen {{ (Ignore, Alt []), entid }} | entid:RparenPlus {{ (Plus, Alt []), entid }} | entid:RparenStar {{ (Star, Alt []), entid }} | entid:RparenQmark {{ (Qmark, Alt []), entid }} | Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor() {{ let (finalmark, subexpr), rparen_entid = factor in begin match subexpr with Alt [] -> (finalmark, (Alt [re])), rparen_entid | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) end }} ? {{ raise(WF_error("Bad content model expression")) }} | Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor() {{ let (finalmark, subexpr), rparen_entid = factor in begin match subexpr with Alt [] -> (finalmark, (Seq [re])), rparen_entid | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) end }} ? {{ raise(WF_error("Bad content model expression")) }} cp(): /* parse either a name, or a parenthesized subexpression "(...)" */ name:Name m:multiplier()? {{ match m with None -> Child name | Some Plus -> Repeated1 (Child name) | Some Star -> Repeated (Child name) | Some Qmark -> Optional (Child name) | _ -> assert false }} ? {{ raise(WF_error("Bad content model expression")) }} | entid:Lparen Ignore* m:choice_or_seq(entid) {{ m }} ? {{ raise(WF_error("Bad content model expression")) }} /********************* ATTRIBUTE LIST DECLARATION ***********************/ attlistdecl(): /* parses . Enters the attribute list in dtd as side- * effect. 
*/ decl_attlist_entid: Decl_attlist $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* el_name: Name ws: Ignore? Ignore* factor: attdef_factor() {{ let at_list, decl_rangle_entid = factor in if decl_attlist_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ATTLIST declaration"); if not ws && at_list <> [] then begin match at_list with (name,_,_) :: _ -> (* This is normally impossible, because the lexer demands * some other token between two names. *) raise(WF_error("Whitespace is missing before `" ^ name ^ "'")); | _ -> assert false end; if extend_dtd then begin let new_el = new dtd_element dtd el_name in (* Note that it is allowed that precedes the corresponding * declaration. In this case we add the element declaration * already to the DTD but leave the content model unspecified. *) let el = try dtd # add_element new_el; new_el with Not_found -> (* already added *) let old_el = dtd # element el_name in if old_el # attribute_names <> [] then config.warner # warn ("More than one ATTLIST declaration for element type `" ^ el_name ^ "'"); old_el in List.iter (fun (a_name, a_type, a_default) -> el # add_attribute a_name a_type a_default extdecl) at_list end }} ? {{ match !yy_position with "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST")) | "el_name" -> raise(WF_error("The name of the element is expected here")) | "factor" -> raise(WF_error("Another attribute name or `>' expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} attdef_factor(): /* parses a list of triples and returns the * list as (string * att_type * att_default) list. */ attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor() {{ let attdef_rest, decl_rangle_entid = factor in if not ws && attdef_rest <> [] then begin match attdef_rest with (name,_,_) :: _ -> raise(WF_error("Missing whitespace before `" ^ name ^ "'")); | _ -> assert false end; (attdef :: attdef_rest), decl_rangle_entid }} ? 
{{ match !yy_position with | "factor" -> raise(WF_error("Another attribute name or `>' expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} | entid:Decl_rangle {{ [], entid }} attdef(): /* Parses a single triple */ name: Name ws1: Ignore Ignore* tp: atttype() ws2: Ignore Ignore* default: defaultdecl() {{ (name,tp,default) }} ? {{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "tp" -> raise(WF_error("Type of attribute or `(' expected")) | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} atttype(): /* Parses an attribute type and returns it as att_type. */ name: Name $ {{ let followup = if name = "NOTATION" then parse_notation else parse_never in }} nota: [followup]()? {{ match name with "CDATA" -> A_cdata | "ID" -> A_id | "IDREF" -> A_idref | "IDREFS" -> A_idrefs | "ENTITY" -> A_entity | "ENTITIES" -> A_entities | "NMTOKEN" -> A_nmtoken | "NMTOKENS" -> A_nmtokens | "NOTATION" -> (match nota with None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)")) | Some n -> n ) | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected")) }} ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }} | Lparen Ignore* name: name_or_nametoken() Ignore* names: nmtoken_factor()* rp: Rparen /* Enumeration */ {{ A_enum(name :: names) }} ? 
{{ match !yy_position with "name" -> raise(WF_error("Name expected")) | "names" -> raise(WF_error("`|' and more names expected, or `)'")) | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) | _ -> raise(WF_error("Bad enumeration type")) }} never(): /* The always failing rule */ $ {{ raise Not_found; }} Doctype /* questionable */ {{ A_cdata (* Does not matter *) }} notation(): Ignore Ignore* lp: Lparen Ignore* name: Name Ignore* names: notation_factor()* rp: Rparen {{ A_notation(name :: names) }} ? {{ match !yy_position with "lp" -> raise(WF_error("`(' expected")) | "name" -> raise(WF_error("Name expected")) | "names" -> raise(WF_error("`|' and more names expected, or `)'")) | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) | _ -> raise(WF_error("Bad NOTATION type")) }} notation_factor(): /* Parse "|" and return the name */ Bar Ignore* name:Name Ignore* {{ name }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad NOTATION type")) }} nmtoken_factor(): /* Parse "|" and return the nmtoken */ Bar Ignore* n:name_or_nametoken() Ignore* {{ n }} ? {{ match !yy_position with "n" -> raise(WF_error("Nametoken expected")) | _ -> raise(WF_error("Bad enumeration type")) }} name_or_nametoken(): n:Name {{ n }} | n:Nametoken {{ n }} /* The default values must be expanded and normalized. This has been implemented * by the function expand_attvalue. */ defaultdecl(): /* Parse the default value for an attribute and return it as att_default */ Required {{ D_required }} | Implied {{ D_implied }} | Fixed ws:Ignore Ignore* str:Unparsed_string {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }} ? 
{{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after #FIXED")) | "str" -> raise(WF_error("String literal expected")) | _ -> raise(WF_error("Bad #FIXED default value")) }} | str:Unparsed_string {{ D_default (expand_attvalue lexerset dtd str config.warner false) }} /**************************** ENTITY DECLARATION ***********************/ entitydecl(decl_entity_entid): /* parses everything _after_ . The parsed * entity declaration is entered into the dtd object as side-effect. */ name: Name $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws: Ignore Ignore* material: entitydef() Ignore* decl_rangle_entid: Decl_rangle /* A general entity */ {{ if decl_entity_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ENTITY declaration"); let en = (* Distinguish between * - internal entities * - external entities * - NDATA (unparsed) entities *) match material with (Some s, None, None) -> new internal_entity dtd name config.warner s p_internal_subset config.errors_with_line_numbers false config.encoding | (None, Some xid, None) -> new external_entity (resolver # clone) dtd name config.warner xid false config.errors_with_line_numbers config.encoding | (None, Some xid, Some n) -> (new ndata_entity name xid n config.encoding :> entity) | _ -> assert false in dtd # add_gen_entity en extdecl }} ? 
{{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing")) | "material" -> raise(WF_error("String literal or identifier expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} | Percent $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* material: pedef() Ignore* decl_rangle_entid: Decl_rangle /* A parameter entity */ {{ if decl_entity_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ENTITY declaration"); let en = (* Distinguish between internal and external entities *) match material with (Some s, None) -> new internal_entity dtd name config.warner s p_internal_subset config.errors_with_line_numbers true config.encoding | (None, Some xid) -> new external_entity (resolver # clone) dtd name config.warner xid true config.errors_with_line_numbers config.encoding | _ -> assert false in (* The following two lines force that even internal entities count * as external (for the standalone check) if the declaration of * the internal entity occurs in an external entity. *) if extdecl then en # set_counts_as_external; dtd # add_par_entity en; }} ? {{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "material" -> raise(WF_error("String literal or identifier expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} entitydef(): /* parses the definition value of a general entity. 
Returns either: * - (Some s, None, None) meaning the definition of an internal entity * with (literal) value s has been found * - (None, Some x, None) meaning that an external parsed entity with * external ID x has been found * - (None, Some x, Some n) meaning that an unparsed entity with * external ID x and notations n has been found */ str:Unparsed_string {{ Some str, None, None }} | id:external_id() ws:Ignore? Ignore* decl:ndatadecl()? {{ if not ws && decl <> None then raise(WF_error("Whitespace missing before `NDATA'")); None, Some id, decl }} pedef(): /* parses the definition value of a parameter entity. Returns either: * - (Some s, None) meaning that the definition of an internal entity * with (literal) value s has been found * - (None, Some x) meaning that an external ID x has been found */ str:Unparsed_string {{ Some str, None }} | id:external_id() {{ None, Some id }} ndatadecl(): /* Parses either NDATA "string" or the empty string; returns Some "string" * in the former, None in the latter case. */ ndata:Name ws:Ignore Ignore* name:Name {{ if ndata = "NDATA" then name else raise(WF_error("NDATA expected")) }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after NDATA")) | "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad NDATA declaration")) }} /**************************** NOTATION DECLARATION *******************/ notationdecl(): /* parses and enters the notation declaration into the * dtd object as side-effect */ decl_notation_entid: Decl_notation ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* sys_or_public: Name /* SYSTEM or PUBLIC */ ws3: Ignore Ignore* str1: Unparsed_string ws: Ignore? Ignore* str2: Unparsed_string? 
Ignore* decl_rangle_entid: Decl_rangle {{ if decl_notation_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with NOTATION declaration"); let xid = (* Note that it is allowed that PUBLIC is only followed by one * string literal *) match sys_or_public with "SYSTEM" -> if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument")); System (recode_utf8 str1) | "PUBLIC" -> begin match str2 with None -> check_public_id str1; Public(recode_utf8 str1,"") | Some p -> if not ws then raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id")); check_public_id str1; Public(recode_utf8 str1, recode_utf8 p) end | _ -> raise(WF_error("PUBLIC or SYSTEM expected")) in if extend_dtd then begin let no = new dtd_notation name xid config.encoding in dtd # add_notation no end }} ? {{ match !yy_position with ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing")) | "name" -> raise(WF_error("Name expected")) | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected")) | ("str1"|"str2") -> raise(WF_error("String literal expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad NOTATION declaration")) }} /****************************** ELEMENTS **************************/ /* In the following rules, the number of error rules is reduced to * improve the performance of the parser. */ contents_start(): /* parses ... misc*, i.e. exactly one element followed * optionally by white space or processing instructions. * The element is entered into the global variables as follows: * - If elstack is non-empty, the parsed element is added as new child to * the top element of the stack. * - If elstack is empty, the root_examplar object is modified rather than * that a new element is created. If additionally the variable root is * None, it is assigned Some root_examplar. * Note that the modification of the root_exemplar is done by the method * internal_init. 
* The reason why the root element is modified rather than newly created * is a typing requirement. It must be possible that the class of the root * is derived from the original class element_impl, i.e. the user must be * able to add additional methods. If we created a new root object, we * would have to denote to which class the new object belongs; the root * would always be an 'element_impl' object (and not a derived object). * If we instead cloned an exemplar object and modified it by the * "create" method, the root object would belong to the same class as the * exemplar (good), but the type of the parsing function would always * state that an 'element_impl' was created (because we can pass the new * object only back via a global variable). The only solution is to * modify the object that has been passed to the parsing function directly. */ $ {{ dtd <- transform_dtd dtd; }} start_tag() content()* {{ () }} content(): /* parses: start tags, end tags, content, or processing * instructions. That the tags are properly nested is dynamically checked. * As result, recognized elements are added to their parent elements, * content is added to the element containing it, and processing instructions * are entered into the element embracing them. (All as side-effects.) */ start_tag() {{ () }} | end_tag() {{ () }} | char_data() {{ () }} | cref() {{ () }} | pi() {{ () }} | entity_ref() {{ () }} | comment() {{ () }} entity_ref(): Begin_entity eref_xmldecl_then_rest() {{ if n_tags_open = 0 then raise(WF_error("Entity reference not allowed here")) }} /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ eref_xmldecl_then_rest(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} content()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} content() content()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} End_entity {{ () }} start_tag(): /* parses or . 
* * EFFECT: If elstack is non-empty, the element is added to the * top element of the stack as new child, and the element * is pushed on the stack. If elstack is empty, the root_exemplar is * modified and gets the parsed name and attribute list. The root_exemplar * is pushed on the stack. If additionally the variable root is empty, too, * this variable is initialized. * If the form has been parsed, no element is pushed * on the stack. */ tag: Tag_beg $ {{ let position = if config.store_element_positions then Some(context.manager # position) else None in }} ws: Ignore? Ignore* attlist: attribute()* emptiness: start_tag_rangle() /* Note: it is guaranteed that there is whitespace between Tag_beg and * the name of the first attribute, because there must be some separator. * So we need not to check ws! */ {{ let rec check_attlist al = match al with (nv1, num1) :: al' -> if not num1 && al' <> [] then begin match al with ((n1,_),_) :: ((n2,_),_) :: _ -> raise(WF_error("Whitespace is missing between attributes `" ^ n1 ^ "' and `" ^ n2 ^ "'")) | _ -> assert false end; check_attlist al' | [] -> () in check_attlist attlist; let name, tag_beg_entid = tag in let attlist' = List.map (fun (nv,_) -> nv) attlist in let d = create_element_node ?position:position spec dtd name attlist' in begin match id_index with None -> () | Some idx -> (* Put the ID attribute into the index, if present *) begin try let v = d # id_attribute_value in (* may raise Not_found *) idx # add v d (* may raise ID_not_unique *) with Not_found -> (* No ID attribute *) () | ID_not_unique -> (* There is already an ID with the same value *) raise(Validation_error("ID not unique")) end end; if n_tags_open = 0 then begin if root = None then begin (* We have found the begin tag of the root element. *) if config.enable_super_root_node then begin (* The user wants the super root instead of the real root. * The real root element becomes the child of the VR. 
*) (* Assertion: self # current is the super root *) assert (self # current # node_type = T_super_root); root <- Some (self # current); self # current # add_node d; doc # init_root (self # current); end else begin (* Normal behaviour: The user wants to get the real root. *) root <- Some d; doc # init_root d; end; end else (* We have found a second topmost element. This is illegal. *) raise(WF_error("Document must consist of only one toplevel element")) end else begin (* We have found some inner begin tag. *) self # save_data; (* Save outstanding data material first *) self # current # add_node d end; if emptiness then (* An empty tag like . *) d # local_validate ~use_dfa:config.validate_by_dfa () else begin (* A non-empty tag. *) Stack.push (d, tag_beg_entid) elstack; n_tags_open <- n_tags_open + 1; end; }} ? {{ match !yy_position with "attlist" -> raise(WF_error("Bad attribute list")) | "emptiness" -> raise(WF_error("`>' or `/>' expected")) | _ -> raise(WF_error("Bad start tag")) }} attribute(): /* Parses name="value" */ n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore* {{ (n,v), ws }} attval(): v:Attval {{ expand_attvalue lexerset dtd v config.warner true }} | v:Attval_nl_normalized {{ expand_attvalue lexerset dtd v config.warner false }} start_tag_rangle(): Rangle {{ false }} | Rangle_empty {{ true }} end_tag(): /* parses . * Pops the top element from the elstack and checks if it is the same * element. 
*/ tag:Tag_end Ignore* Rangle {{ (* Pop the matching start tag and check proper nesting. *) let name, tag_end_entid = tag in if n_tags_open = 0 then raise(WF_error("End-tag without start-tag")); self # save_data; (* Save outstanding data material first *) let x, tag_beg_entid = Stack.pop elstack in let x_name = match x # node_type with | T_element n -> n | _ -> assert false in if name <> x_name then raise(WF_error("End-tag does not match start-tag")); (* Note: '!=' is physical inequality here; entity IDs are compared by identity. *) if tag_beg_entid != tag_end_entid then raise(WF_error("End-tag not in the same entity as the start-tag")); x # local_validate ~use_dfa:config.validate_by_dfa (); n_tags_open <- n_tags_open - 1; assert (n_tags_open >= 0); }}
char_data(): /* Parses any literal characters not otherwise matching, and adds the * characters to the top element of elstack. * If elstack is empty, it is assumed that there is no surrounding * element, and any non-white space character is forbidden. */ data:CharData {{ if n_tags_open = 0 then (* only white space is allowed *) self # only_whitespace data else self # collect_data data (* We collect the chardata material until the next end tag is * reached. Then the collected material will be concatenated and * stored as a single T_data node (see end_tag rule above) * using save_data. *) }} | data:Cdata {{ if n_tags_open = 0 then raise (WF_error("CDATA section not allowed here")); self # collect_data data (* Also collect CDATA material *) }}
cref(): /* Parses &#...; and adds the character to the top element of elstack. */ code:CRef {{ if n_tags_open = 0 then (* No surrounding element: character references are not allowed *) raise(WF_error("Character reference not allowed here")); self # collect_data (character config.encoding config.warner code) (* Also collect character references *) }}
pi(): /* Parses (but not ). * If there is a top element in elstack, the processing instruction is added * to this element.
*/ pi: PI
    {{ (* Handle a processing instruction found in the document instance. *)
       let position =
         if config.store_element_positions then
           Some(context.manager # position)
         else
           None
       in
       let target,value = pi in
       (* Fix: use '&&' instead of the long-deprecated strict-AND alias '&'
        * (identical semantics for this expression). *)
       if n_tags_open = 0 && not config.enable_super_root_node then
         (* PI outside the root element (and no super root node requested):
          * attach it to the document object itself. *)
         doc # add_pinstr (new proc_instruction target value config.encoding)
       else begin
         (* Special case: if processing instructions are processed inline,
          * they are wrapped into T_pinstr nodes. *)
         if config.enable_pinstr_nodes then begin
           self # save_data;            (* Save outstanding data material first *)
           let pinstr = new proc_instruction target value config.encoding in
           let wrapper = create_pinstr_node ?position:position spec dtd pinstr in
           wrapper # local_validate();  (* succeeds always *)
           self # current # add_node wrapper;
         end
         else
           (* Normal behaviour: Add the PI to the parent element. *)
           self # current # add_pinstr
             (new proc_instruction target value config.encoding)
       end
    }}

comment(): /* Parses */
  Comment_begin
  $ {{ (* Record the position only if comment nodes will actually be built. *)
       let position =
         if config.enable_comment_nodes && config.store_element_positions then
           Some(context.manager # position)
         else
           None
       in
    }}
  mat: Comment_material*
  ce: Comment_end
    {{ (* Comment text is only materialized as a node when
        * enable_comment_nodes is set; otherwise it is dropped. *)
       if config.enable_comment_nodes then begin
         self # save_data;            (* Save outstanding data material first *)
         let comment_text = String.concat "" mat in
         let wrapper = create_comment_node ?position:position spec dtd comment_text in
         wrapper # local_validate();  (* succeeds always *)
         self # current # add_node wrapper;
       end
    }}
  ? {{ match !yy_position with
         | "ce" -> raise(WF_error("`-->' expected"))
         | _ -> raise(WF_error("Bad comment"))
    }}

%%

(* The method "parse" continues here...
*) try match start_symbol with Ext_document -> parse_ext_document context.current context.get_next | Ext_declarations -> parse_ext_declarations context.current context.get_next | Ext_element -> parse_ext_element context.current context.get_next with Not_found -> raise Parsing.Parse_error (*********** The method "parse" ends here *************) (**********************************************************************) (* Here ends the class definition: *) end ;;
(**********************************************************************)
open Pxp_reader;;
(* A minimal extension implementation: it only remembers the node it is
 * attached to.  [clone] returns a functional copy ({< >}); [node] fails
 * by assertion if [set_node] has not been called before. *)
class default_ext = object(self) val mutable node = (None : ('a extension node as 'a) option) method clone = {< >} method node = match node with None -> assert false | Some n -> n method set_node n = node <- Some n end ;;
let default_extension = new default_ext;;
(* Default specification: every exemplar (super root, comment, pinstr,
 * data, default element) shares the single [default_extension] object,
 * and the element mapping starts out empty. *)
let default_spec = make_spec_from_mapping ~super_root_exemplar: (new element_impl default_extension) ~comment_exemplar: (new element_impl default_extension) ~default_pinstr_exemplar: (new element_impl default_extension) ~data_exemplar: (new data_impl default_extension) ~default_element_exemplar: (new element_impl default_extension) ~element_mapping: (Hashtbl.create 1) () ;;
(* [idref_pass id_index root] walks the tree below [root] and checks that
 * every IDREF/IDREFS attribute value occurs as a key in [id_index]; a
 * missing ID raises Validation_error, wrapped in [At ...] with the node's
 * entity/line/column when line numbers are available (pos_line <> 0). *)
let idref_pass id_index root = let error t att value = let name = match t # node_type with T_element name -> name | _ -> assert false in let text = "Attribute `" ^ att ^ "' of element `" ^ name ^ "' refers to unknown ID `" ^ value ^ "'" in let pos_ent, pos_line, pos_col = t # position in if pos_line = 0 then raise(Validation_error text) else raise(At("In entity " ^ pos_ent ^ " at line " ^ string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^ ":\n", Validation_error text)) in let rec check_tree t = let idref_atts = t # idref_attribute_names in List.iter (fun att -> match t # attribute att with Value s -> begin try ignore(id_index # find s) with Not_found -> error t att s end | Valuelist l -> List.iter (fun s -> try ignore(id_index # find s) with Not_found -> error t att s ) l |
Implied_value -> () ) idref_atts; List.iter check_tree (t # sub_nodes) in check_tree root ;;
exception Return_DTD of dtd;;
(* Used by extract_dtd_from_document_entity to jump out of the parser *)
(* [call_parser] is the common driver behind all parse_* entry points:
 * it creates the toplevel entity from [~source] (either a ready-made
 * entity or an external ID resolved via the resolver), initializes the
 * resolver, builds the parser object and the entity manager, runs the
 * grammar entry point [~entry], and guarantees that the toplevel entity
 * is closed on every exit path.  Errors are re-raised wrapped in
 * [At(position, ...)]; a [Return_DTD] escape is re-raised unchanged
 * after the cleanup.  If [cfg.idref_pass] is set and both an index and a
 * root exist, the IDREF check runs over the resulting tree.  Returns the
 * parser object. *)
let call_parser ~configuration:cfg ~source:src ~dtd ~extensible_dtd ~document:doc ~specification:spec ~process_xmldecl ~transform_dtd ~(id_index : 'ext #index option) ~use_document_entity ~entry ~init_lexer = let e = cfg.errors_with_line_numbers in let w = cfg.warner in let r, en = match src with Entity(m,r') -> r', m dtd | ExtID(xid,r') -> r', if use_document_entity then new document_entity r' dtd "[toplevel]" w xid e cfg.encoding else new external_entity r' dtd "[toplevel]" w xid false e cfg.encoding in r # init_rep_encoding cfg.encoding; r # init_warner w; en # set_debugging_mode (cfg.debugging_mode); let pobj = new parser_object doc dtd extensible_dtd cfg r spec process_xmldecl transform_dtd (id_index :> 'ext index option) in let mgr = new entity_manager en in en # open_entity true init_lexer; begin try let context = make_context mgr in pobj # parse context entry; ignore(en # close_entity); with Return_DTD d -> ignore(en # close_entity); raise(Return_DTD d) | error -> ignore(en # close_entity); r # close_all; let pos = mgr # position_string in raise (At(pos, error)) end; if cfg.idref_pass then begin match id_index with None -> () | Some idx -> ( match pobj # root with None -> () | Some root -> idref_pass idx root; ) end; pobj
(* [parse_dtd_entity cfg src] parses [src] as a stand-alone DTD (entry
 * point Ext_declarations) and returns the validated dtd object. *)
let parse_dtd_entity cfg src = (* Parse a DTD given as separate entity.
*)
  let dtd = new dtd cfg.warner cfg.encoding in
  let doc = new document cfg.warner in
  (* Fix: the parser object is not needed afterwards — parsing the
   * Ext_declarations entry fills [dtd] by side effect.  Bind it as
   * [_pobj] to make this explicit and silence the unused-variable
   * warning (warning 26). *)
  let _pobj = call_parser
                ~configuration:cfg
                ~source:src
                ~dtd:dtd
                ~extensible_dtd:true      (* Extend the DTD by parsed declarations *)
                ~document:doc
                ~specification:default_spec
                ~process_xmldecl:false    (* The XML declaration is ignored
                                           * (except 'encoding') *)
                ~transform_dtd:(fun x -> x)   (* Do not transform the DTD *)
                ~id_index: None
                ~use_document_entity:false
                ~entry:Ext_declarations   (* Entry point of the grammar *)
                ~init_lexer:Declaration   (* The initially used lexer *)
  in
  dtd # validate;
  if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
  dtd
;;

(* [parse_content_entity ?id_index cfg src dtd spec] parses [src] as an
 * isolated element in validating mode against the (already complete and
 * validated) [dtd] and returns the root node of the parsed element.
 * Raises WF_error if no root element was produced. *)
let parse_content_entity ?id_index cfg src dtd spec =
  (* Parse an element given as separate entity *)
  dtd # validate;          (* ensure that the DTD is valid *)
  if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
  let doc = new document cfg.warner in
  let pobj = call_parser
               ~configuration:cfg
               ~source:src
               ~dtd:dtd
               ~extensible_dtd:true      (* Extend the DTD by parsed declarations *)
               ~document:doc
               ~specification:spec
               ~process_xmldecl:false    (* The XML declaration is ignored
                                          * (except 'encoding') *)
               ~transform_dtd:(fun x -> x)   (* Do not transform the DTD *)
               ~id_index:(id_index :> 'ext index option)
               ~use_document_entity:false
               ~entry:Ext_element        (* Entry point of the grammar *)
               ~init_lexer:Content       (* The initially used lexer *)
  in
  match pobj # root with
      Some r -> r
    | None -> raise(WF_error("No root element"))
;;

(* [parse_wfcontent_entity cfg src spec] parses [src] as an isolated
 * element in well-formedness mode: an "allow arbitrary" DTD is used and
 * parsed declarations do not extend it. *)
let parse_wfcontent_entity cfg src spec =
  let dtd = new dtd cfg.warner cfg.encoding in
  dtd # allow_arbitrary;
  let doc = new document cfg.warner in
  let pobj = call_parser
               ~configuration:cfg
               ~source:src
               ~dtd:dtd
               ~extensible_dtd:false     (* Do not extend the DTD *)
               ~document:doc
               ~specification:spec
               ~process_xmldecl:false    (* The XML declaration is ignored
                                          * (except 'encoding') *)
               ~transform_dtd:(fun x -> x)   (* Do not transform the DTD *)
               ~id_index:None
               ~use_document_entity:false
               ~entry:Ext_element        (* Entry point of
the grammar *) ~init_lexer:Content (* The initially used lexer *) in match pobj # root with Some r -> r | None -> raise(WF_error("No root element")) ;;
(* [iparse_document_entity ?transform_dtd ?id_index cfg0 src spec p_wf]
 * parses a complete document entity and returns the document object.
 * [p_wf] selects well-formedness mode (true) vs. validating mode
 * (false): in wf mode the DTD allows arbitrary content, parsed
 * declarations do not extend it, and the standalone declaration is not
 * recognized (cfg0 is adjusted accordingly). *)
let iparse_document_entity ?(transform_dtd = (fun x -> x)) ?id_index cfg0 src spec p_wf = (* Parse an element given as separate entity *) (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *) let cfg = { cfg0 with recognize_standalone_declaration = cfg0.recognize_standalone_declaration && (not p_wf) } in let dtd = new dtd cfg.warner cfg.encoding in if p_wf then dtd # allow_arbitrary; let doc = new document cfg.warner in let pobj = call_parser ~configuration:cfg ~source:src ~dtd:dtd ~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations * only if in validating mode *) ~document:doc ~specification:spec ~process_xmldecl:true (* The XML declaration is processed *) (* TODO: change to 'not p_wf' ? *) ~transform_dtd:(fun dtd -> let dtd' = transform_dtd dtd in if cfg.accept_only_deterministic_models then dtd' # only_deterministic_models; dtd') ~id_index:(id_index :> 'ext index option) ~use_document_entity:true ~entry:Ext_document (* Entry point of the grammar *) ~init_lexer:Document (* The initially used lexer *) in pobj # doc ;;
(* Validating-mode frontend of [iparse_document_entity]. *)
let parse_document_entity ?(transform_dtd = (fun x -> x)) ?id_index cfg src spec = iparse_document_entity ~transform_dtd:transform_dtd ?id_index:(id_index : 'ext #index option :> 'ext index option) cfg src spec false;;
(* Well-formedness-mode frontend of [iparse_document_entity]. *)
let parse_wfdocument_entity cfg src spec = iparse_document_entity cfg src spec true;;
(* [extract_dtd_from_document_entity cfg src] parses only far enough to
 * obtain the DTD of the document: the [transform_dtd] hook aborts the
 * parse by raising [Return_DTD], which is caught here. *)
let extract_dtd_from_document_entity cfg src = let transform_dtd dtd = raise (Return_DTD dtd) in try let doc = parse_document_entity ~transform_dtd:transform_dtd cfg src default_spec in (* Should not happen: *) doc # dtd with Return_DTD dtd -> (* The normal case: *) dtd ;;
(* The default configuration: warnings are silently dropped, ISO-8859-1
 * internal encoding, validation by DFA, no extra node types.  (The
 * record literal continues on the next physical line.) *)
let default_config = let w = new drop_warnings in { warner = w; errors_with_line_numbers = true; enable_pinstr_nodes = false; enable_super_root_node = false; enable_comment_nodes = false;
encoding = `Enc_iso88591; recognize_standalone_declaration = true; store_element_positions = true; idref_pass = false; validate_by_dfa = true; accept_only_deterministic_models = true; debugging_mode = false; }
(* [hash_index] is the standard implementation of the [index] class
 * type: a hashtable from ID values to nodes.  [add] raises
 * [ID_not_unique] when the ID is already present (the raise happens in
 * the [try] body and is deliberately not caught by the [Not_found]
 * handler); [find] raises [Not_found] for unknown IDs; [index] exposes
 * the underlying table. *)
class [ 'ext ] hash_index = object constraint 'ext = 'ext node #extension val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t) method add s n = try ignore(Hashtbl.find ht s); raise ID_not_unique with Not_found -> Hashtbl.add ht s n method find s = Hashtbl.find ht s method index = ht end
(* ====================================================================== * History: * * $Log$ * Revision 1.1 2000/11/17 09:57:29 lpadovan * Initial revision * * Revision 1.14 2000/08/26 23:23:14 gerd * Bug: from_file must not interpret the file name as URL path. * Bug: When PI and comment nodes are generated, the collected data * material must be saved first. * * Revision 1.13 2000/08/19 21:30:03 gerd * Improved the error messages of the parser * * Revision 1.12 2000/08/18 20:16:25 gerd * Implemented that Super root nodes, pinstr nodes and comment * nodes are included into the document tree. * * Revision 1.11 2000/08/14 22:24:55 gerd * Moved the module Pxp_encoding to the netstring package under * the new name Netconversion. * * Revision 1.10 2000/07/23 02:16:33 gerd * Support for DFAs. * * Revision 1.9 2000/07/14 13:57:29 gerd * Added the id_index feature. * * Revision 1.8 2000/07/09 17:52:45 gerd * New implementation for current_data. * The position of elements is stored on demand. * * Revision 1.7 2000/07/09 01:00:35 gerd * Improvement: It is now guaranteed that only one data node * is added for consecutive character material. * * Revision 1.6 2000/07/08 16:27:29 gerd * Cleaned up the functions calling the parser. * New parser argument: transform_dtd. * Implementations for 'extract_dtd_from_document_entity' and * 'parse_wfcontent_entity'. * * Revision 1.5 2000/07/06 23:05:18 gerd * Initializations of resolvers were missing.
* * Revision 1.4 2000/07/06 22:11:01 gerd * Fix: The creation of the non-virtual root element is protected * in the same way as the virtual root element. * * Revision 1.3 2000/07/04 22:15:18 gerd * Change: Using the new resolver capabilities. * Still incomplete: the new extraction and parsing functions. * * Revision 1.2 2000/06/14 22:19:06 gerd * Added checks such that it is impossible to mix encodings. * * Revision 1.1 2000/05/29 23:48:38 gerd * Changed module names: * Markup_aux into Pxp_aux * Markup_codewriter into Pxp_codewriter * Markup_document into Pxp_document * Markup_dtd into Pxp_dtd * Markup_entity into Pxp_entity * Markup_lexer_types into Pxp_lexer_types * Markup_reader into Pxp_reader * Markup_types into Pxp_types * Markup_yacc into Pxp_yacc * See directory "compatibility" for (almost) compatible wrappers emulating * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. * * ====================================================================== * Old logs from markup_yacc.m2y: * * Revision 1.9 2000/05/29 21:14:57 gerd * Changed the type 'encoding' into a polymorphic variant. * * Revision 1.8 2000/05/27 19:26:19 gerd * Change: The XML declaration is interpreted right after * it has been parsed (no longer after the document): new function * check_and_parse_xmldecl. * When elements, attributes, and entities are declared * it is stored whether the declaration happens in an external * entity (for the standalone check). * The option recognize_standalone_declaration is interpreted. * * Revision 1.7 2000/05/20 20:31:40 gerd * Big change: Added support for various encodings of the * internal representation. * * Revision 1.6 2000/05/14 21:51:24 gerd * Change: Whitespace is handled by the grammar, and no longer * by the entity. * * Revision 1.5 2000/05/14 17:50:54 gerd * Updates because of changes in the token type. * * Revision 1.4 2000/05/11 22:09:17 gerd * Fixed the remaining problems with conditional sections. 
* This seems to be also a weakness of the XML spec! * * Revision 1.3 2000/05/09 00:02:44 gerd * Conditional sections are now recognized by the parser. * There seem some open questions; see the TODO comments! * * Revision 1.2 2000/05/08 22:01:44 gerd * Introduced entity managers (see markup_entity.ml). * The XML declaration is now recognized by the parser. If such * a declaration is found, the method process_xmldecl of the currently * active entity is called. If the first token is not an XML declaration, * the method process_missing_xmldecl is called instead. * Some minor changes. * * Revision 1.1 2000/05/06 23:21:49 gerd * Initial revision. * * * ====================================================================== * * COPIED FROM REVISION 1.19 OF markup_yacc.mly * * Revision 1.19 2000/05/01 15:20:08 gerd * "End tag matches start tag" is checked before "End tag in the * same entity as start tag". * * Revision 1.18 2000/04/30 18:23:08 gerd * Bigger change: Introduced the concept of virtual roots. First, * this reduces the number of checks. Second, it makes it possible to * return the virtual root to the caller instead of the real root (new * config options 'virtual_root' and 'processing_instructions_inline'). * Minor changes because of better CR/CRLF handling. * * Revision 1.17 2000/03/13 23:47:46 gerd * Updated because of interface changes. (See markup_yacc_shadow.mli * rev. 1.8) * * Revision 1.16 2000/01/20 20:54:43 gerd * New config.errors_with_line_numbers. * * Revision 1.15 1999/12/17 22:27:58 gerd * Bugfix: The value of 'p_internal_subset' (an instance * variable of the parser object) is to true when the internal subset * begins, and is set to false when this subset ends. The error was * that references to external entities within this subset did not * set 'p_internal_subset' to false; this is now corrected by introducing * the 'p_internal_subset_stack'. 
* This is a typical example of how the code gets more and * more complicated and that it is very difficult to really understand * what is going on. * * Revision 1.14 1999/11/09 22:23:37 gerd * Removed the invocation of "init_dtd" of the root document. * This method is no longer available. The DTD is also passed to the * document object by the root element, so nothing essential changes. * * Revision 1.13 1999/10/25 23:37:09 gerd * Bugfix: The warning "More than one ATTLIST declaration for element * type ..." is only generated if an ATTLIST is found while there are already * attributes for the element. * * Revision 1.12 1999/09/01 23:08:38 gerd * New frontend function: parse_wf_document. This simply uses * a DTD that allows anything, and by the new parameter "extend_dtd" it is * avoided that element, attlist, and notation declarations are added to this * DTD. The idea is that this function simulates a well-formedness parser. * Tag_beg, Tag_end carry the entity_id. The "elstack" stores the * entity_id of the stacked tag. This was necessary because otherwise there * are some examples that produce incorrectly nested elements. * p_internal_subset is a variable that stores whether the internal * subset is being parsed. This is important because entity declarations in * internal subsets are not allowed to contain parameter references. * It is checked if the "elstack" is empty after all has been parsed. * Processing instructions outside DTDs and outside elements are now * added to the document. * The rules of mixed and regexp style content models have been * separated. The code is now much simpler. * Entity references outside elements are detected and rejected. * * Revision 1.11 1999/09/01 16:26:08 gerd * Improved the quality of error messages. * * Revision 1.10 1999/08/31 19:13:31 gerd * Added checks on proper PE nesting. The idea is that tokens such * as Decl_element and Decl_rangle carry an entity ID with them. This ID * is simply an object of type < >, i.e. 
you can only test on identity. * The lexer always produces tokens with a dummy ID because it does not * know which entity is the current one. The entity layer replaces the dummy * ID with the actual ID. The parser checks that the IDs of pairs such as * Decl_element and Decl_rangle are the same; otherwise a Validation_error * is produced. * * Revision 1.9 1999/08/15 20:42:01 gerd * Corrected a misleading message. * * Revision 1.8 1999/08/15 20:37:34 gerd * Improved error messages. * Bugfix: While parsing document entities, the subclass document_entity is * now used instead of external_entity. The rules in document entities are a bit * stronger. * * Revision 1.7 1999/08/15 14:03:59 gerd * Empty documents are not allowed. * "CDATA section not allowed here" is a WF_error, not a Validation_ * error. * * Revision 1.6 1999/08/15 02:24:19 gerd * Removed some grammar rules that were used for testing. * Documents without DTD can now have arbitrary elements (formerly * they were not allowed to have any element). * * Revision 1.5 1999/08/14 22:57:20 gerd * It is allowed that external entities are empty because the * empty string is well-parsed for both declarations and contents. Empty * entities can be referenced anywhere because the references are replaced * by nothing. Because of this, the Begin_entity...End_entity brace is only * inserted if the entity is non-empty. (Otherwise references to empty * entities would not be allowed anywhere.) * As a consequence, the grammar has been changed such that a * single Eof is equivalent to Begin_entity,End_entity without content. * * Revision 1.4 1999/08/14 22:20:01 gerd * The "config" slot has now a component "warner" which is * an object with a "warn" method. This is used to warn about characters * that cannot be represented in the Latin 1 alphabet. * Furthermore, there is a new component "debugging_mode". * Some Parse_error exceptions have been changed into Validation_error. 
* The interfaces of functions/classes imported from other modules * have changed; the invocations have been adapted. * Contents may contain CDATA sections that have been forgotten. * * Revision 1.3 1999/08/11 15:00:41 gerd * The Begin_entity ... End_entity brace is also possible in * 'contents'. * The configuration passed to the parsing object contains always * the resolver that is actually used. * * Revision 1.2 1999/08/10 21:35:12 gerd * The XML/encoding declaration at the beginning of entities is * evaluated. In particular, entities have now a method "xml_declaration" * which returns the name/value pairs of such a declaration. The "encoding" * setting is interpreted by the entity itself; "version", and "standalone" * are interpreted by Markup_yacc.parse_document_entity. Other settings * are ignored (this does not conform to the standard; the standard prescribes * that "version" MUST be given in the declaration of document; "standalone" * and "encoding" CAN be declared; no other settings are allowed). * TODO: The user should be warned if the standard is not exactly * fulfilled. -- The "standalone" property is not checked yet. * * Revision 1.1 1999/08/10 00:35:52 gerd * Initial revision. * * *)