--- /dev/null
+(* $Id$ -*- tuareg -*-
+ * ----------------------------------------------------------------------
+ * PXP: The polymorphic XML parser for Objective Caml.
+ * Copyright by Gerd Stolpmann. See LICENSE for details.
+ *)
+
+open Parsing
+open Pxp_types
+open Pxp_lexer_types
+open Pxp_dtd
+open Pxp_entity
+open Pxp_document
+open Pxp_aux
+
+(* Some types from the interface definition: *)
+
+exception ID_not_unique
+
+class type [ 'ext ] index =
+object
+ constraint 'ext = 'ext node #extension
+ method add : string -> 'ext node -> unit
+ method find : string -> 'ext node
+end
+
+
+type config =
+ { warner : collect_warnings;
+ errors_with_line_numbers : bool;
+ enable_pinstr_nodes : bool;
+ enable_super_root_node : bool;
+ enable_comment_nodes : bool;
+ encoding : rep_encoding;
+ recognize_standalone_declaration : bool;
+ store_element_positions : bool;
+ idref_pass : bool;
+ validate_by_dfa : bool;
+ accept_only_deterministic_models : bool;
+ debugging_mode : bool;
+ }
+
+type source =
+ Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
+ | ExtID of (ext_id * Pxp_reader.resolver)
+
+
+type start_symbol =
+ Ext_document
+ | Ext_declarations
+ | Ext_element
+
+
+type context =
+ { mutable current : unit -> token; (* get the current token *)
+ mutable get_next : unit -> token; (* go on to the next token; return it *)
+ mutable current_token : token; (* This is the current token *)
+ mutable manager : entity_manager; (* The entity manager *)
+ }
+
+
+let make_context entity_manager =
+ let c =
+ { current = (fun _ -> assert false);
+ get_next = (fun _ -> assert false);
+ current_token = Eof;
+ manager = entity_manager;
+ }
+ in
+ (* Note that the function which is stored in get_next_ref can be changed
+ * as a side-effect when an entity is opened or closed. The function in
+ * c.get_next must be programmed such that always the current "get_next"
+ * function is executed.
+ *)
+ let get_next_ref = entity_manager # yy_get_next_ref in
+ c.current <- (fun () -> c.current_token);
+ c.get_next <- (fun () -> let tok = !get_next_ref() in
+ c.current_token <- tok;
+ tok);
+ ignore(c.get_next());
+ c
+;;
+
+
+let from_channel ?system_encoding ?id:init_id ?fixenc ch =
+
+ (* Reading from a channel works by modifying the algorithm of
+ * resolve_as_file.
+ *)
+
+ let url_syntax = (* A syntax suitable for "file" URLs *)
+ { Neturl.null_url_syntax with
+ Neturl.url_enable_scheme = Neturl.Url_part_allowed;
+ Neturl.url_enable_host = Neturl.Url_part_allowed;
+ Neturl.url_enable_path = Neturl.Url_part_required;
+ Neturl.url_accepts_8bits = true;
+ }
+ in
+
+ let an_url =
+ Neturl.make_url
+ ~scheme: "file"
+ ~host: ""
+ ~path: [ "" ]
+ url_syntax
+ in
+
+ let init_channel_done = ref false in
+ (* Whether the first access to this source has already happened. *)
+
+ (* The task of url_of_id is:
+ * - When it is called the first time, and no init_id is present,
+ * the URL file:/// is passed back (an_url). This forces that
+ * absolute path names /path/dir/... will be interpreted as
+ * file path names. (But relative path names will not work.)
+ * - If an init_id has been passed, we can assume that the opened URL
+ * is exactly this init_id. By raising Not_competent it is indicated
+ * that the standard method is to be used for the interpretation of
+ * the URL.
+ * - Otherwise, the channel is already being read, and thus cannot again
+ * opened. (This case is handled in channel_of_url.)
+ *)
+
+ let url_of_id xid =
+ if !init_channel_done then begin
+ (* Use the normal way of determining the URL of the ID: *)
+ raise Pxp_reader.Not_competent
+ end
+ else begin
+ match init_id with
+ None ->
+ an_url
+ (* If the channel is not associated with any URL: Simply pass
+ * the URL file:/// back.
+ *)
+ | Some the_init_id ->
+ assert (the_init_id = xid);
+ raise Pxp_reader.Not_competent
+ (* If the channel is associated with a URL, the corresponding
+ * ID must be passed when the first invocation happens.
+ *)
+ end
+ in
+
+ (* The task of channel_of_url:
+ * - If it is called the first time ("else"), the channel is returned
+ * - Otherwise, the channel is already being read, and thus cannot again
+ * opened. By raising Not_competent it is signaled that the
+ * resolve_as_file object must not continue to open the URL.
+ *)
+
+ let channel_of_url url =
+ if !init_channel_done then
+ raise Pxp_reader.Not_competent
+ else begin
+ init_channel_done := true;
+ ch, fixenc
+ end
+ in
+
+ let r =
+ new Pxp_reader.resolve_as_file
+ ?system_encoding:system_encoding
+ ~url_of_id:url_of_id
+ ~channel_of_url:channel_of_url
+ ()
+ in
+
+ let init_xid =
+ match init_id with
+ None -> Anonymous
+ | Some id ->
+ (* Note: 'id' may be illegal (malformed); in this case, the first
+ * invocation of url_of_id will raise Not_competent, and the 'open_in'
+ * method will fail.
+ *)
+ id
+ in
+
+ ExtID(init_xid, r)
+;;
+
+
+let from_file ?system_encoding utf8_filename =
+
+ let r =
+ new Pxp_reader.resolve_as_file
+ ?system_encoding:system_encoding
+ ()
+ in
+
+ let utf8_abs_filename =
+ if utf8_filename <> "" && utf8_filename.[0] = '/' then
+ utf8_filename
+ else
+ Sys.getcwd() ^ "/" ^ utf8_filename
+ in
+
+ let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in
+ let url = Neturl.make_url
+ ~scheme:"file"
+ ~host:"localhost"
+ ~path:(Neturl.split_path utf8_abs_filename)
+ syntax
+ in
+
+ let xid = System (Neturl.string_of_url url) in
+
+
+ ExtID(xid, r)
+;;
+
+
+let from_string ?fixenc s =
+ let r =
+ new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in
+ ExtID(Anonymous, r)
+;;
+
+
+(**********************************************************************)
+
+class ['ext] parser_object
+ init_doc init_dtd init_extend_dtd init_config init_resolver init_spec
+ init_process_xmldecl transform_dtd id_index
+ =
+ object (self)
+
+ (* Note that the 'ext parameter has been the motivation to make the
+ * parser a class.
+ *)
+
+ val mutable dtd = init_dtd
+ (* The DTD being parsed; or the DTD currently assumed *)
+
+ val extend_dtd = init_extend_dtd
+ (* Whether the DTD should be extended by ELEMENT, ATTLIST, and
+ * NOTATION declarations or not. (True for validating mode,
+ * false for well-formedness mode.)
+ *)
+
+ val transform_dtd = transform_dtd
+ (* A function transforming the DTD *)
+
+ val id_index = (id_index : 'ext index option)
+ (* The ID index or None *)
+
+ val process_xmldecl = init_process_xmldecl
+ (* Whether the XML declaration is parsed and the found XML version
+ * and standalone declaration are passed to 'doc'.
+ *)
+
+ val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding)
+
+ val doc = init_doc
+ (* The current document *)
+
+ method doc = (doc : 'ext document)
+
+ val resolver = init_resolver
+ (* The resolver for external IDs *)
+
+ val config = init_config
+ (* The current configuration *)
+
+ val elstack = (Stack.create() : ('ext node * entity_id) Stack.t)
+ (* The element stack containing all open elements, i.e. elements that
+ * have begun by a start tag but that have not been finished (end tag).
+ * If the parser sees a start tag, it creates the element and pushes it
+ * on top of this stack. If the parser recognizes an end tag, it pulls
+ * one element from the stack and checks if it has the same name as
+ * given with the end tag.
+ *
+ * At initialization time, a special element is pushed on the stack,
+ * the so-called super root. It is always the bottommost
+ * element of the stack, and serves as a guard.
+ * [See "initializer" below.]
+ *)
+
+ method current =
+ (* Get the top element of the element stack *)
+ try
+ fst(Stack.top elstack)
+ with
+ Stack.Empty -> assert false
+ (* Not possible, because the super root is always the element
+ * at the bottom of the stack.
+ *)
+
+ val mutable n_tags_open = 0
+ (* Number of begin tags that have been parsed and whose corresponding
+ * end tags have not yet been parsed
+ *)
+
+ val mutable p_internal_subset = false
+ (* true while parsing the internal subset - there are some additional
+ * constraints for internal subsets, and because of this it must
+ * be known whether the current declaration is contained in the
+ * internal or external subset of the DTD.
+ *)
+
+ val mutable root = None
+ (* Contains the root element (topmost element) while it is being parsed
+ * and after it has been parsed.
+ * This variable is None before the root element is seen.
+ *)
+
+ method root = root
+
+ val spec = init_spec
+ (* A hashtable that contains exemplar objects for the various element
+ * types. If an element is parsed, the exemplar is looked up and
+ * "cloned" (by the "create" method)
+ *)
+
+ val mutable current_data = []
+ (* Collects character data. *)
+
+ method collect_data s =
+ (* Collects the character material 's' *)
+ current_data <- s :: current_data
+
+ method save_data =
+ (* Puts the material collected in 'current_data' into a new
+ * node, and appends this node as new sub node to 'current'
+ *)
+ match current_data with
+ [] ->
+ ()
+ | [ str ] ->
+ if str <> "" then
+ self # current # add_node (create_data_node spec dtd str);
+ current_data <- []
+ | _ ->
+ let count = List.fold_left
+ (fun acc s -> acc + String.length s)
+ 0
+ current_data in
+ let str = String.create count in
+ let pos = ref count in
+ List.iter
+ (fun s ->
+ let l = String.length s in
+ pos := !pos - l;
+ String.blit
+ ~src:s
+ ~src_pos:0
+ ~dst:str
+ ~dst_pos:(!pos)
+ ~len:l
+ )
+ current_data;
+ assert(!pos = 0);
+ if str <> "" then
+ self # current # add_node (create_data_node spec dtd str);
+ current_data <- []
+
+
+ method only_whitespace data =
+ (* Checks that the string "data" contains only whitespace. On failure,
+ * Validation_error is raised.
+ *)
+ let lexbuf = Lexing.from_string data in
+ let t1 = lexerset.scan_name_string lexbuf in
+ if t1 <> Ignore then
+ raise(WF_error("Data not allowed here"));
+ let t2 = lexerset.scan_name_string lexbuf in
+ if t2 <> Eof then
+ raise(WF_error("Data not allowed here"));
+ ()
+
+ initializer
+ (* CHECKS: *)
+ if config.encoding <> dtd # encoding then
+ failwith("Encoding mismatch");
+
+ (* --- Initialize 'elstack': Push the super-root on the stack. *)
+ let super_root =
+ if config.enable_super_root_node then
+ create_super_root_node spec dtd
+ else
+ (* because spec may not contain an exemplar for the super root: *)
+ create_no_node spec dtd
+ in
+ (* Move the super root or the emulation to the stack: *)
+ Stack.push (super_root, (self :> entity_id)) elstack;
+
+
+
+ (********* Here the method "parse" begins. The grammar below is
+ * transformed to a local function of this method
+ *)
+
+ method parse context start_symbol =
+
+ let parse_ignored_section yy_current yy_get_next =
+ (* A special parser which should be used after <![IGNORE[.
+ * It parses until the corresponding ]]> is found.
+ *)
+
+ while yy_current() = Ignore do
+ ignore(yy_get_next());
+ done;
+
+ ( match yy_current() with
+ Conditional_body _ -> ()
+ | _ -> raise Parsing.Parse_error;
+ );
+
+ let en = context.manager # current_entity in
+ let llev = ref 1 in
+ while !llev >= 1 do
+ let igntok = en # next_ignored_token in
+ (* next_ignored_token: uses a special lexer that only
+ * recognizes Conditional_begin and Conditional_end;
+ * other character combinations are ignored.
+ *)
+ (* NOTE: next_ignored_token works much like yy_get_next,
+ * but it does not set the current token!
+ *)
+ match igntok with
+ Conditional_begin _ ->
+ llev := !llev + 1
+ | Conditional_end _ ->
+ llev := !llev - 1;
+ (* Because the loop may be exited now: *)
+ context.current_token <- igntok;
+ | (End_entity | Eof) ->
+ raise Parsing.Parse_error
+ | _ ->
+ ()
+ done;
+
+ in
+
+
+ let check_and_parse_xmldecl xmldecl =
+ if process_xmldecl then begin
+ let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in
+ check_version_num v;
+ doc # init_xml_version v;
+ let v = match s with
+ None -> false
+ | Some "yes" -> true
+ | Some "no" -> false
+ | _ -> raise (WF_error("Illegal 'standalone' declaration"))
+ in
+ if config.recognize_standalone_declaration then
+ dtd # set_standalone_declaration v
+ end
+ in
+
+ let recode_utf8 s =
+ (* Recode 's' to UTF-8 *)
+ if config.encoding = `Enc_utf8 then
+ s (* No recoding necessary *)
+ else
+ Netconversion.recode_string
+ ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s
+ in
+
+
+%%
+
+/* The following grammar looks similar to ocamlyacc grammars, but
+ * ocamlyacc is actually not used to transform the grammar into a parser.
+ * Instead, the parser generator m2parsergen is applied.
+ *
+ * The format of the grammar is different (see m2parsergen/README),
+ * but I hope that you can understand most features immediately.
+ *
+ * The type of the parser is different: m2parsergen creates a top-down
+ * parser while ocamlyacc generates a LALR-1 parser.
+ *
+ * The way the generated code is called is different: ocamlyacc produces
+ * lots of top-level definitions whereas m2parsergen generates only
+ * a local let-in-phrase. This is explained in the already mentioned
+ * README file.
+ */
+
+/* See Pxp_types.ml for comments to the various tokens */
+
+%token Begin_entity
+%token End_entity
+%token Comment_begin
+%token Comment_end
+%token Ignore
+%token Eq
+%token Rangle
+%token Rangle_empty
+%token <> Conditional_begin
+%token <> Conditional_body
+%token <> Conditional_end
+%token Percent
+%token Plus
+%token Star
+%token Bar
+%token Comma
+%token Qmark
+%token Pcdata
+%token Required
+%token Implied
+%token Fixed
+%token Eof
+
+%token <> Comment_material
+%token <> Doctype
+%token <> Doctype_rangle
+%token <> Dtd_begin
+%token <> Dtd_end
+%token <> Decl_element
+%token <> Decl_attlist
+%token <> Decl_entity
+%token <> Decl_notation
+%token <> Decl_rangle
+%token <> Lparen
+%token <> Rparen
+%token <> RparenPlus
+%token <> RparenStar
+%token <> RparenQmark
+
+%token <> Tag_beg
+%token <> Tag_end
+
+%token <> PI
+%token <> PI_xml
+%token <> Cdata
+%token <> CRef
+%token <> ERef
+%token <> PERef
+%token <> CharData
+%token <> LineEnd
+%token <> Name
+%token <> Nametoken
+%token <> Attval
+%token <> Attval_nl_normalized
+%token <> Unparsed_string
+
+/* START SYMBOLS:
+ *
+ * "ext_document": parses a complete XML document (i.e. containing a
+ * <!DOCTYPE..> and an element)
+ * "ext_declarations": parses an "external DTD subset", i.e. a sequence
+ * of declarations
+ * "ext_element": parses a single element (no <!DOCTYPE...> allowed);
+ * the element needs not to be the root element of the
+ * DTD
+ *
+ * The functions corresponding to these symbols return always () because
+ * they only have side-effects.
+ */
+
+/* SOME GENERAL COMMENTS:
+ *
+ * The parser does not get its tokens from the lexers directly. Instead of
+ * this, there is an entity object between the parser and the lexers. This
+ * object already handles:
+ *
+ * - References to general and parameter entities. The token stream is
+ * modified such that tokens automatically come from the referenced entities.
+ * External parameter entities and all general entities are embraced by
+ * the two special tokens Begin_entity and End_entity. The parser must
+ * check that these braces are correctly nested.
+ */
+
+%%
+
+
+ext_document():
+ Begin_entity
+ doc_xmldecl_then_misc_then_prolog_then_rest() End_entity
+ {{
+ if n_tags_open <> 0 then
+ raise(WF_error("Missing end tag"))
+ }}
+
+
+/* In the following rule, we must find out whether there is an XML declaration
+ * or not, and directly after that either "process_xmldecl" or
+ * "process_missing_xmldecl" of the current entity must be called.
+ * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out
+ * in the "$" clause immediately following the first token.
+ *
+ * TODO: This is not enough. The first token may be a tag, and the tag
+ * may already contain non-ASCII characters. (But in this case, the resolvers
+ * assume UTF8, and they are right...)
+ */
+
+doc_xmldecl_then_misc_then_prolog_then_rest():
+ pl:PI_xml
+ $ {{ context.manager # current_entity # process_xmldecl pl;
+ check_and_parse_xmldecl pl;
+ }}
+ misc()* doc_prolog_then_rest()
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ misc() misc()* doc_prolog_then_rest()
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ doctypedecl() misc()* contents_start()
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ contents_start()
+ {{ () }}
+
+
+doc_prolog_then_rest():
+ doctypedecl() misc()* contents_start()
+ {{ () }}
+| contents_start()
+ {{ () }}
+
+
+ext_element():
+ Begin_entity el_xmldecl_then_misc_then_rest() End_entity
+ {{
+ if n_tags_open <> 0 then
+ raise(WF_error("Missing end tag"))
+ }}
+
+
+/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
+
+el_xmldecl_then_misc_then_rest():
+ pl:PI_xml
+ $ {{ context.manager # current_entity # process_xmldecl pl; }}
+ misc()* contents_start()
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ misc() misc()* contents_start()
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ contents_start()
+ {{ () }}
+
+
+ext_declarations():
+ /* Parses a sequence of declarations given by an entity. As side-effect,
+ * the parsed declarations are put into the dtd object.
+ */
+ Begin_entity decl_xmldecl_then_rest()
+ {{ () }}
+| Eof
+ {{ () }}
+
+
+decl_xmldecl_then_rest():
+ /* Note: This rule is also called from declaration()! */
+ pl:PI_xml
+ $ {{ context.manager # current_entity # process_xmldecl pl;
+ }}
+ declaration()* End_entity
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ declaration() declaration()* End_entity
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ End_entity
+ {{ () }}
+
+
+misc():
+ pi()
+ {{ () }}
+| data: CharData
+ /* In this context, the lexers sometimes do not recognize white space;
+ * instead CharData tokens containing white space are delivered.
+ */
+ {{ self # only_whitespace data }}
+| Ignore
+ {{ () }}
+| comment()
+ {{ () }}
+
+
+/********************* DOCUMENT TYPE DECLARATION *************************/
+
+doctypedecl():
+ /* parses from <!DOCTYPE to >. As side-effect, first the declarations of
+ * the internal DTD (if any) are put into !!on_dtd, then the declarations
+ * of the external DTD (if any) are put into this DTD object.
+ */
+ doctype_entid: Doctype
+ ws: Ignore Ignore*
+ doctypedecl_material (doctype_entid)
+ {{ () }}
+ ? {{ match !yy_position with
+ "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'"))
+ | _ -> raise(WF_error("Bad DOCTYPE declaration"))
+ }}
+
+
+/* TRICK:
+ * ws: Ignore? Ignore*
+ * is meant seriously. The effect is that ws becomes a boolean variable
+ * which is true if there is an Ignore token and false otherwise.
+ * This construct is faster than just
+ * ws: Ignore*
+ * in which case ws becomes an integer variable containing the number of
+ * Ignore tokens. Counting the number of tokens is slower than only checking
+ * the existence.
+ *
+ * We need the information whether there is an Ignore token (representing
+ * white space), because white space is only obligatory if also an identifier
+ * for the external subset is parsed; this conditional syntax constraint is
+ * simply programmed in the body of the grammar rule.
+ */
+
+doctypedecl_material(doctype_entid):
+ root_name: Name
+ ws: Ignore? Ignore*
+ external_subset: external_id()?
+ Ignore*
+ internal_subset: internal_dtd()?
+ Ignore*
+ doctype_rangle_entid: Doctype_rangle
+ {{
+ if doctype_entid != doctype_rangle_entid then
+ raise (Validation_error("Entities not properly nested with DOCTYPE declaration"));
+ dtd # set_root root_name;
+ begin match external_subset, internal_subset with
+ None, None -> () (* no DTD means no ID *)
+ | None, Some _ -> dtd # set_id Internal
+ | Some id, None -> dtd # set_id (External id)
+ | Some id, Some _ -> dtd # set_id (Derived id)
+ end;
+ (* Get now the external doctype declaration. Note that the internal
+ * subset has precedence and must be read first.
+ *)
+ begin match external_subset with
+ None -> ()
+ | Some id ->
+ if not ws then
+ raise(WF_error("Whitespace is missing after `DOCTYPE " ^
+ root_name ^ "'"));
+ let r' = resolver # clone in
+ let pobj =
+ new parser_object
+ (new document config.warner)
+ dtd
+ extend_dtd
+ config
+ r'
+ spec
+ process_xmldecl
+ (fun x -> x)
+ None
+ in
+ let en = new external_entity r' dtd "[dtd]"
+ config.warner id false config.errors_with_line_numbers
+ config.encoding
+ in
+ en # set_debugging_mode (config.debugging_mode);
+ let mgr = new entity_manager en in
+ en # open_entity true Declaration;
+ try
+ let context = make_context mgr in
+ pobj # parse context Ext_declarations;
+ ignore(en # close_entity);
+ with
+ error ->
+ ignore(en # close_entity);
+ r' # close_all;
+ let pos = mgr # position_string in
+ raise (At(pos, error))
+ end;
+ dtd # validate
+ }}
+ ? {{
+ match !yy_position with
+ "doctype_rangle_entid" -> raise(WF_error("`>' expected"))
+ | _ -> raise(WF_error("Bad DOCTYPE declaration"))
+ }}
+
+/* Note that there are no keywords for SYSTEM or PUBLIC, as these would
+ * be difficult to recognize in the lexical contexts. Because of this,
+ * SYSTEM/PUBLIC is parsed as name, and the rule for everything after
+ * SYSTEM/PUBLIC is computed dynamically.
+ */
+
+external_id():
+ tok:Name
+ $ {{
+ let followup =
+ match tok with
+ "SYSTEM" -> parse_system_id
+ (* Apply the rule system_id (below) to parse the
+ * rest of the ID
+ *)
+ | "PUBLIC" -> parse_public_id
+ (* Apply the rule public_id (below) to parse the
+ * rest of the ID
+ *)
+ | _ -> raise(WF_error("SYSTEM or PUBLIC expected"))
+ in
+ }}
+ ws:Ignore Ignore*
+ r:[followup]()
+ {{ r }}
+ ? {{ match !yy_position with
+ "ws" -> raise(WF_error("Whitespace is missing after " ^ tok))
+ | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier"))
+ }}
+
+
+system_id():
+ str:Unparsed_string
+ {{ System (recode_utf8 str) }}
+
+
+public_id():
+ str1: Unparsed_string
+ ws: Ignore Ignore*
+ str2: Unparsed_string
+ {{ check_public_id str1;
+ Public(recode_utf8 str1, recode_utf8 str2)
+ }}
+ ? {{ match !yy_position with
+ "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier"))
+ | _ -> raise(WF_error("Bad PUBLIC identifier"))
+ }}
+
+
+/* The internal subset: "[" declaration* "]". While parsing the declarations
+ * the object variable p_internal_subset must be true; however, if there
+ * are entity references, this variable must be reset to false during
+ * the entity. (See the rule for "declaration" below.)
+ */
+
+internal_dtd():
+ dtd_begin_entid: internal_dtd_begin()
+ declaration()*
+ dtd_end_entid: internal_dtd_end()
+ {{
+ if dtd_begin_entid != dtd_end_entid then
+ raise(Validation_error("Entities not properly nested with internal DTD subset"))
+ }}
+ ? {{ match !yy_position with
+ "dtd_end_entid" -> raise(WF_error("`]' expected"))
+ | _ -> raise(WF_error("Bad internal DTD subset"))
+ }}
+
+
+internal_dtd_begin():
+ Dtd_begin
+ {{ assert (not p_internal_subset);
+ p_internal_subset <- true }}
+
+
+internal_dtd_end():
+ Dtd_end
+ {{ assert p_internal_subset;
+ p_internal_subset <- false }}
+
+
+declaration():
+ /* Parses a single declaration (or processing instruction). As side-effect
+ * the parsed declaration is stored into the dtd object.
+ */
+ elementdecl()
+ {{ () }}
+| attlistdecl()
+ {{ () }}
+| entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid)
+ {{ () }}
+ ? {{ match !yy_position with
+ "ws" -> raise(WF_error("Whitespace is missing after ENTITY"))
+ | "e" -> raise(WF_error("Name or `%' expected"))
+ | _ -> raise(WF_error("Bad entity declaration"))
+ }}
+| notationdecl()
+ {{ () }}
+| pi: PI
+ {{ let target, value = pi in
+ let pi = new proc_instruction target value config.encoding in
+ dtd # add_pinstr pi
+ }}
+| Ignore
+ {{ () }}
+| Comment_begin Comment_material* ce:Comment_end
+ {{ () }}
+ ? {{ match !yy_position with
+ "ce" -> raise(WF_error("`-->' expected"))
+ | _ -> raise(WF_error("Bad comment"))
+ }}
+| Begin_entity
+ $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity'
+ * rule is parsed. This allows unrestricted usage of parameter entities
+ * within declarations of internal entities.
+ *)
+ let old_p_internal_subset = p_internal_subset in
+ p_internal_subset <- false;
+ }}
+ decl_xmldecl_then_rest()
+ {{ (* Restore the old value of 'p_internal_subset'. *)
+ p_internal_subset <- old_p_internal_subset;
+ ()
+ }}
+| begin_entid:Conditional_begin
+ $ {{ (* Check whether conditional sections are allowed at this position. *)
+ if p_internal_subset then
+ raise(WF_error("Restriction of the internal subset: Conditional sections not allowed"));
+ }}
+ Ignore*
+ cond:conditional_section() end_entid:Conditional_end
+ {{ (* Check whether Conditional_begin and Conditional_end are in the same
+ * entity. (This restriction is explained in the file SPECS.)
+ *)
+ if begin_entid != end_entid then
+ raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)"));
+ }}
+ ? {{ match !yy_position with
+ "end_entid" -> raise(WF_error("`>]>' expected"))
+ | "cond" -> raise(WF_error("INCLUDE or IGNORE expected"))
+ | _ -> raise(WF_error("Bad conditional section"))
+ }}
+
+/* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the
+ * right parsing rule is dynamic.
+ * Note that parse_ignored_section is not defined by a grammar rule but
+ * by a conventional let-binding above.
+ */
+
+conditional_section():
+ include_or_ignore:Name
+ $ {{ let parsing_function =
+ match include_or_ignore with
+ "INCLUDE" -> parse_included_section
+ (* invoke rule "included_section" below *)
+ | "IGNORE" -> parse_ignored_section
+ (* invoke function "parse_ignored_section" *)
+ | _ -> raise(WF_error("INCLUDE or IGNORE expected"))
+ in
+ }}
+ [ parsing_function ] ()
+ {{ () }}
+ ? {{ raise(WF_error("Bad conditional section")) }}
+
+included_section():
+ Conditional_body declaration()*
+ {{ () }}
+| Ignore Ignore* Conditional_body declaration()*
+ {{ () }}
+
+
+/*************************** ELEMENT DECLARATIONS ********************/
+
+elementdecl():
+ /* parses <!ELEMENT ... >. Puts the parsed element type as side-effect into
+ * dtd.
+ */
+ decl_element_entid: Decl_element
+ $ {{ let extdecl = context.manager # current_entity_counts_as_external in
+ }}
+ ws1: Ignore Ignore*
+ name: Name
+ ws2: Ignore Ignore*
+ content_model: contentspec()
+ Ignore*
+ decl_rangle_entid: Decl_rangle
+ {{
+ if decl_element_entid != decl_rangle_entid then
+ raise (Validation_error "Entities not properly nested with ELEMENT declaration");
+ if extend_dtd then begin
+ let el = new dtd_element dtd name in
+ (* It is allowed that an <!ATTLIST...> precedes the corresponding
+ * <!ELEMENT...>. Because of this it is possible that there is already
+ * an element called 'name' in the DTD, and we only must set the content
+ * model of this element.
+ *)
+ try
+ dtd # add_element el;
+ el # set_cm_and_extdecl content_model extdecl;
+ with
+ Not_found -> (* means: there is already an element 'name' *)
+ let el' = dtd # element name in
+ el' # set_cm_and_extdecl content_model extdecl;
+ (* raises Validation_error if el' already has a content model *)
+ end
+ }}
+ ? {{ match !yy_position with
+ ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
+ | "name" -> raise(WF_error("The name of the element is expected here"))
+ | "content_model" -> raise(WF_error("Content model expression expected"))
+ | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
+ | _ -> raise(WF_error("Bad element type declaration"))
+ }}
+
+contentspec():
+ /* parses a content model and returns it (type content_model_type) */
+ name: Name /* EMPTY or ANY */
+ {{ match name with
+ "EMPTY" -> Empty
+ | "ANY" -> Any
+ | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected"))
+ }}
+| entid:Lparen Ignore* term:mixed_or_regexp(entid)
+ {{ term }}
+ ? {{ raise(WF_error("Bad content model expression")) }}
+
+
+/* Many of the following rules have an lparen_entid argument. This is the
+ * internal ID of the entity containing the corresponding left parenthesis;
+ * by comparing it with the ID of the entity of the right parenthesis the
+ * contraint is implemented that both parentheses must be in the same entity.
+ */
+
+mixed_or_regexp(lparen_entid):
+ re: choice_or_seq(lparen_entid)
+ {{ Regexp re }}
+| m: mixed(lparen_entid)
+ {{ m }}
+
+
+multiplier():
+ /* returns one of the multiplier symbols (?,*,+) */
+ Plus
+ {{ Plus }}
+| Star
+ {{ Star }}
+| Qmark
+ {{ Qmark }}
+
+
+mixed (lparen_entid) :
+ Pcdata
+ Ignore*
+ material: mixed_alternatives_top()
+ {{
+ let rest, rparen_entid = material in
+ if lparen_entid != rparen_entid then
+ raise (Validation_error "Entities not properly nested with parentheses");
+ Mixed (MPCDATA :: rest)
+ }}
+ ? {{ raise(WF_error("Bad content model expression")) }}
+
+
+mixed_alternatives_top():
+ entid: Rparen
+ {{ [], entid }}
+| entid: RparenStar
+ {{ [], entid }}
+| Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar
+ {{
+ (MChild name :: names), entid
+ }}
+ ? {{ match !yy_position with
+ "name" -> raise(WF_error("Name expected"))
+ | "entid" -> raise(WF_error("`)*' expected"))
+ | _ -> raise(WF_error("Bad content model expression"))
+ }}
+
+
+mixed_alternative() :
+ Bar Ignore* name:Name Ignore*
+ {{ MChild name }}
+ ? {{ match !yy_position with
+ "name" -> raise(WF_error("Name expected"))
+ | _ -> raise(WF_error("Bad content model expression"))
+ }}
+
+
+
+choice_or_seq (lparen_entid):
+ /* parses either a regular expression, or a mixed expression. Returns
+ * Mixed spec or Regexp spec (content_model_type).
+ * Which kind of expression (regexp or mixed) is being read is recognized
+ * after the first subexpression has been parsed; the other subexpressions
+ * must be of the same kind.
+ */
+ re: cp()
+ Ignore*
+ factor: choice_or_seq_factor()
+ {{
+ let (finalmark,subexpr), rparen_entid = factor in
+ if lparen_entid != rparen_entid then
+ raise (Validation_error "Entities not properly nested with parentheses");
+ (* Check that the other subexpressions are "regexp", too, and
+ * merge them with the first.
+ *)
+ let re' =
+ match subexpr with
+ Alt [] -> re
+ | Alt alt -> Alt (re :: alt)
+ | Seq seq -> Seq (re :: seq)
+ | _ -> assert false
+ in
+ (* Interpret the finalmark. *)
+ match finalmark with
+ Ignore -> re'
+ | Plus -> Repeated1 re'
+ | Star -> Repeated re'
+ | Qmark -> Optional re'
+ | _ -> assert false
+ }}
+ ? {{ raise(WF_error("Bad content model expression")) }}
+
+choice_or_seq_factor():
+ /* Parses "|<subexpr>|...)" or ",<subexpr>,...)", both forms optionally
+ * followed by ?, *, or +.
+ * Returns ((finalmark, expr), rparen_entid), where
+ * - finalmark is the character after the right parenthesis or Ignore
+ * - expr is either
+ * Alt [] meaning that only ")" has been found
+ * Alt non_empty_list meaning that the subexpressions are separated by '|'
+ * Seq non_empty_list meaning that the subexpressions are separated by ','
+ */
+ entid:Rparen
+ {{ (Ignore, Alt []), entid }}
+| entid:RparenPlus
+ {{ (Plus, Alt []), entid }}
+| entid:RparenStar
+ {{ (Star, Alt []), entid }}
+| entid:RparenQmark
+ {{ (Qmark, Alt []), entid }}
+| Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
+ {{
+ let (finalmark, subexpr), rparen_entid = factor in
+ begin match subexpr with
+ Alt [] -> (finalmark, (Alt [re])), rparen_entid
+ | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid
+ | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
+ end
+ }}
+ ? {{ raise(WF_error("Bad content model expression")) }}
+| Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
+ {{
+ let (finalmark, subexpr), rparen_entid = factor in
+ begin match subexpr with
+ Alt [] -> (finalmark, (Seq [re])), rparen_entid
+ | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid
+ | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
+ end
+ }}
+ ? {{ raise(WF_error("Bad content model expression")) }}
+
+cp():
+ /* parse either a name, or a parenthesized subexpression "(...)" */
+ name:Name m:multiplier()?
+ {{ match m with
+ None -> Child name
+ | Some Plus -> Repeated1 (Child name)
+ | Some Star -> Repeated (Child name)
+ | Some Qmark -> Optional (Child name)
+ | _ -> assert false
+ }}
+ ? {{ raise(WF_error("Bad content model expression")) }}
+| entid:Lparen Ignore* m:choice_or_seq(entid)
+ {{ m }}
+ ? {{ raise(WF_error("Bad content model expression")) }}
+
+
+/********************* ATTRIBUTE LIST DECLARATION ***********************/
+
+attlistdecl():
+ /* parses <!ATTLIST ... >. Enters the attribute list in dtd as side-
+ * effect.
+ */
+ decl_attlist_entid: Decl_attlist
+ $ {{ let extdecl = context.manager # current_entity_counts_as_external in
+ }}
+ ws1: Ignore Ignore*
+ el_name: Name
+ ws: Ignore? Ignore*
+ factor: attdef_factor()
+ {{
+ let at_list, decl_rangle_entid = factor in
+
+ if decl_attlist_entid != decl_rangle_entid then
+ raise (Validation_error "Entities not properly nested with ATTLIST declaration");
+
+ if not ws && at_list <> [] then begin
+ match at_list with
+ (name,_,_) :: _ ->
+ (* This is normally impossible, because the lexer demands
+ * some other token between two names.
+ *)
+ raise(WF_error("Whitespace is missing before `" ^ name ^ "'"));
+ | _ -> assert false
+ end;
+
+ if extend_dtd then begin
+ let new_el = new dtd_element dtd el_name in
+ (* Note that it is allowed that <!ATTLIST...> precedes the corresponding
+ * <!ELEMENT...> declaration. In this case we add the element declaration
+ * already to the DTD but leave the content model unspecified.
+ *)
+ let el =
+ try
+ dtd # add_element new_el;
+ new_el
+ with
+ Not_found -> (* already added *)
+ let old_el = dtd # element el_name in
+ if old_el # attribute_names <> [] then
+ config.warner # warn ("More than one ATTLIST declaration for element type `" ^
+ el_name ^ "'");
+ old_el
+ in
+ List.iter
+ (fun (a_name, a_type, a_default) ->
+ el # add_attribute a_name a_type a_default extdecl)
+ at_list
+ end
+ }}
+ ? {{ match !yy_position with
+ "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST"))
+ | "el_name" -> raise(WF_error("The name of the element is expected here"))
+ | "factor" -> raise(WF_error("Another attribute name or `>' expected"))
+ | _ -> raise(WF_error("Bad attribute declaration"))
+ }}
+
+
+attdef_factor():
+ /* parses a list of triples <name> <type> <default value> and returns the
+ * list as (string * att_type * att_default) list.
+ */
+ attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor()
+ {{
+ let attdef_rest, decl_rangle_entid = factor in
+ if not ws && attdef_rest <> [] then begin
+ match attdef_rest with
+ (name,_,_) :: _ ->
+ raise(WF_error("Missing whitespace before `" ^ name ^ "'"));
+ | _ -> assert false
+ end;
+ (attdef :: attdef_rest), decl_rangle_entid }}
+ ? {{ match !yy_position with
+ | "factor" -> raise(WF_error("Another attribute name or `>' expected"))
+ | _ -> raise(WF_error("Bad attribute declaration"))
+ }}
+| entid:Decl_rangle
+ {{ [], entid }}
+
+
+attdef():
+ /* Parses a single triple */
+ name: Name
+ ws1: Ignore Ignore*
+ tp: atttype()
+ ws2: Ignore Ignore*
+ default: defaultdecl()
+ {{ (name,tp,default) }}
+ ? {{ match !yy_position with
+ ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
+ | "tp" -> raise(WF_error("Type of attribute or `(' expected"))
+ | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected"))
+ | _ -> raise(WF_error("Bad attribute declaration"))
+ }}
+
+atttype():
+ /* Parses an attribute type and returns it as att_type. */
+ name: Name
+ $ {{ let followup =
+ if name = "NOTATION" then
+ parse_notation
+ else
+ parse_never
+ in
+ }}
+ nota: [followup]()?
+ {{
+ match name with
+ "CDATA" -> A_cdata
+ | "ID" -> A_id
+ | "IDREF" -> A_idref
+ | "IDREFS" -> A_idrefs
+ | "ENTITY" -> A_entity
+ | "ENTITIES" -> A_entities
+ | "NMTOKEN" -> A_nmtoken
+ | "NMTOKENS" -> A_nmtokens
+ | "NOTATION" ->
+ (match nota with
+ None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)"))
+ | Some n -> n
+ )
+ | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected"))
+ }}
+ ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }}
+
+| Lparen
+ Ignore*
+ name: name_or_nametoken()
+ Ignore*
+ names: nmtoken_factor()*
+ rp: Rparen
+ /* Enumeration */
+ {{ A_enum(name :: names) }}
+ ? {{ match !yy_position with
+ "name" -> raise(WF_error("Name expected"))
+ | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
+ | "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
+ | _ -> raise(WF_error("Bad enumeration type"))
+ }}
+
+
+never():
+ /* The always failing rule */
+ $ {{ raise Not_found; }}
+ Doctype /* questionable */
+ {{ A_cdata (* Does not matter *)
+ }}
+
+
+notation():
+ Ignore Ignore*
+ lp: Lparen
+ Ignore*
+ name: Name
+ Ignore*
+ names: notation_factor()*
+ rp: Rparen
+ {{ A_notation(name :: names) }}
+ ? {{ match !yy_position with
+ "lp" -> raise(WF_error("`(' expected"))
+ | "name" -> raise(WF_error("Name expected"))
+ | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
+ | "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
+ | _ -> raise(WF_error("Bad NOTATION type"))
+ }}
+
+
+notation_factor():
+ /* Parse "|<name>" and return the name */
+ Bar Ignore* name:Name Ignore*
+ {{ name }}
+ ? {{ match !yy_position with
+ "name" -> raise(WF_error("Name expected"))
+ | _ -> raise(WF_error("Bad NOTATION type"))
+ }}
+
+nmtoken_factor():
+ /* Parse "|<nmtoken>" and return the nmtoken */
+ Bar Ignore* n:name_or_nametoken() Ignore*
+ {{ n }}
+ ? {{ match !yy_position with
+ "n" -> raise(WF_error("Nametoken expected"))
+ | _ -> raise(WF_error("Bad enumeration type"))
+ }}
+
+
+name_or_nametoken():
+ n:Name {{ n }}
+| n:Nametoken {{ n }}
+
+
+/* The default values must be expanded and normalized. This has been implemented
+ * by the function expand_attvalue.
+ */
+
+
+defaultdecl():
+ /* Parse the default value for an attribute and return it as att_default */
+ Required
+ {{ D_required }}
+| Implied
+ {{ D_implied }}
+| Fixed ws:Ignore Ignore* str:Unparsed_string
+ {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }}
+ ? {{ match !yy_position with
+ "ws" -> raise(WF_error("Whitespace is missing after #FIXED"))
+ | "str" -> raise(WF_error("String literal expected"))
+ | _ -> raise(WF_error("Bad #FIXED default value"))
+ }}
+| str:Unparsed_string
+ {{ D_default (expand_attvalue lexerset dtd str config.warner false) }}
+
+
+/**************************** ENTITY DECLARATION ***********************/
+
+entitydecl(decl_entity_entid):
+ /* parses everything _after_ <!ENTITY until the matching >. The parsed
+ * entity declaration is entered into the dtd object as side-effect.
+ */
+ name: Name
+ $ {{ let extdecl = context.manager # current_entity_counts_as_external in
+ }}
+ ws: Ignore Ignore*
+ material: entitydef()
+ Ignore*
+ decl_rangle_entid: Decl_rangle
+ /* A general entity */
+ {{
+ if decl_entity_entid != decl_rangle_entid then
+ raise (Validation_error "Entities not properly nested with ENTITY declaration");
+ let en =
+ (* Distinguish between
+ * - internal entities
+ * - external entities
+ * - NDATA (unparsed) entities
+ *)
+ match material with
+ (Some s, None, None) ->
+ new internal_entity dtd name config.warner s p_internal_subset
+ config.errors_with_line_numbers false config.encoding
+ | (None, Some xid, None) ->
+ new external_entity (resolver # clone) dtd name config.warner
+ xid false config.errors_with_line_numbers
+ config.encoding
+
+ | (None, Some xid, Some n) ->
+ (new ndata_entity name xid n config.encoding :> entity)
+ | _ -> assert false
+ in
+ dtd # add_gen_entity en extdecl
+ }}
+ ? {{ match !yy_position with
+ "ws" -> raise(WF_error("Whitespace is missing"))
+ | "material" -> raise(WF_error("String literal or identifier expected"))
+ | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
+ | _ -> raise(WF_error("Bad entity declaration"))
+ }}
+
+| Percent
+ $ {{ let extdecl = context.manager # current_entity_counts_as_external in
+ }}
+ ws1: Ignore Ignore*
+ name: Name
+ ws2: Ignore Ignore*
+ material: pedef()
+ Ignore*
+ decl_rangle_entid: Decl_rangle
+ /* A parameter entity */
+ {{
+ if decl_entity_entid != decl_rangle_entid then
+ raise (Validation_error "Entities not properly nested with ENTITY declaration");
+ let en =
+ (* Distinguish between internal and external entities *)
+ match material with
+ (Some s, None) ->
+ new internal_entity dtd name config.warner s p_internal_subset
+ config.errors_with_line_numbers true config.encoding
+ | (None, Some xid) ->
+ new external_entity (resolver # clone) dtd name config.warner
+ xid true config.errors_with_line_numbers
+ config.encoding
+ | _ -> assert false
+ in
+
+ (* The following two lines force that even internal entities count
+ * as external (for the standalone check) if the declaration of
+ * the internal entity occurs in an external entity.
+ *)
+ if extdecl then
+ en # set_counts_as_external;
+
+ dtd # add_par_entity en;
+ }}
+ ? {{ match !yy_position with
+ ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
+ | "material" -> raise(WF_error("String literal or identifier expected"))
+ | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
+ | _ -> raise(WF_error("Bad entity declaration"))
+ }}
+
+
+entitydef():
+ /* parses the definition value of a general entity. Returns either:
+ * - (Some s, None, None) meaning the definition of an internal entity
+ * with (literal) value s has been found
+ * - (None, Some x, None) meaning that an external parsed entity with
+ * external ID x has been found
+ * - (None, Some x, Some n) meaning that an unparsed entity with
+ * external ID x and notations n has been found
+ */
+ str:Unparsed_string
+ {{ Some str, None, None }}
+| id:external_id() ws:Ignore? Ignore* decl:ndatadecl()?
+ {{ if not ws && decl <> None then
+ raise(WF_error("Whitespace missing before `NDATA'"));
+ None, Some id, decl
+ }}
+
+
+pedef():
+ /* parses the definition value of a parameter entity. Returns either:
+ * - (Some s, None) meaning that the definition of an internal entity
+ * with (literal) value s has been found
+ * - (None, Some x) meaning that an external ID x has been found
+ */
+ str:Unparsed_string
+ {{ Some str, None }}
+| id:external_id()
+ {{ None, Some id }}
+
+
+ndatadecl():
+ /* Parses either NDATA "string" or the empty string; returns Some "string"
+ * in the former, None in the latter case.
+ */
+ ndata:Name ws:Ignore Ignore* name:Name
+ {{ if ndata = "NDATA" then
+ name
+ else
+ raise(WF_error("NDATA expected"))
+ }}
+ ? {{ match !yy_position with
+ "ws" -> raise(WF_error("Whitespace is missing after NDATA"))
+ | "name" -> raise(WF_error("Name expected"))
+ | _ -> raise(WF_error("Bad NDATA declaration"))
+ }}
+
+/**************************** NOTATION DECLARATION *******************/
+
+notationdecl():
+ /* parses <!NOTATION ... > and enters the notation declaration into the
+ * dtd object as side-effect
+ */
+ decl_notation_entid: Decl_notation
+ ws1: Ignore Ignore*
+ name: Name
+ ws2: Ignore Ignore*
+ sys_or_public: Name /* SYSTEM or PUBLIC */
+ ws3: Ignore Ignore*
+ str1: Unparsed_string
+ ws: Ignore? Ignore*
+ str2: Unparsed_string?
+ Ignore*
+ decl_rangle_entid: Decl_rangle
+ {{
+ if decl_notation_entid != decl_rangle_entid then
+ raise (Validation_error "Entities not properly nested with NOTATION declaration");
+ let xid =
+ (* Note that it is allowed that PUBLIC is only followed by one
+ * string literal
+ *)
+ match sys_or_public with
+ "SYSTEM" ->
+ if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument"));
+ System (recode_utf8 str1)
+ | "PUBLIC" ->
+ begin match str2 with
+ None ->
+ check_public_id str1;
+ Public(recode_utf8 str1,"")
+ | Some p ->
+ if not ws then
+ raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id"));
+ check_public_id str1;
+ Public(recode_utf8 str1, recode_utf8 p)
+ end
+ | _ -> raise(WF_error("PUBLIC or SYSTEM expected"))
+ in
+ if extend_dtd then begin
+ let no = new dtd_notation name xid config.encoding in
+ dtd # add_notation no
+ end
+ }}
+ ? {{ match !yy_position with
+ ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing"))
+ | "name" -> raise(WF_error("Name expected"))
+ | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected"))
+ | ("str1"|"str2") -> raise(WF_error("String literal expected"))
+ | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
+ | _ -> raise(WF_error("Bad NOTATION declaration"))
+ }}
+
+/****************************** ELEMENTS **************************/
+
+/* In the following rules, the number of error rules is reduced to
+ * improve the performance of the parser.
+ */
+
+
+contents_start():
+ /* parses <element>...</element> misc*, i.e. exactly one element followed
+ * optionally by white space or processing instructions.
+ * The element is entered into the global variables as follows:
+ * - If elstack is non-empty, the parsed element is added as new child to
+ * the top element of the stack.
+ * - If elstack is empty, the root_examplar object is modified rather than
+ * that a new element is created. If additionally the variable root is
+ * None, it is assigned Some root_examplar.
+ * Note that the modification of the root_exemplar is done by the method
+ * internal_init.
+ * The reason why the root element is modified rather than newly created
+ * is a typing requirement. It must be possible that the class of the root
+ * is derived from the original class element_impl, i.e. the user must be
+ * able to add additional methods. If we created a new root object, we
+ * would have to denote to which class the new object belongs; the root
+ * would always be an 'element_impl' object (and not a derived object).
+ * If we instead cloned an exemplar object and modified it by the
+ * "create" method, the root object would belong to the same class as the
+ * exemplar (good), but the type of the parsing function would always
+ * state that an 'element_impl' was created (because we can pass the new
+ * object only back via a global variable). The only solution is to
+ * modify the object that has been passed to the parsing function directly.
+ */
+ $ {{ dtd <- transform_dtd dtd; }}
+ start_tag() content()*
+ {{ () }}
+
+
+content():
+ /* parses: start tags, end tags, content, or processing
+ * instructions. That the tags are properly nested is dynamically checked.
+ * As result, recognized elements are added to their parent elements,
+ * content is added to the element containing it, and processing instructions
+ * are entered into the element embracing them. (All as side-effects.)
+ */
+ start_tag()
+ {{ () }}
+| end_tag()
+ {{ () }}
+| char_data()
+ {{ () }}
+| cref()
+ {{ () }}
+| pi()
+ {{ () }}
+| entity_ref()
+ {{ () }}
+| comment()
+ {{ () }}
+
+
+entity_ref():
+ Begin_entity eref_xmldecl_then_rest()
+ {{ if n_tags_open = 0 then
+ raise(WF_error("Entity reference not allowed here"))
+ }}
+
+
+/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
+
+eref_xmldecl_then_rest():
+ pl:PI_xml
+ $ {{ context.manager # current_entity # process_xmldecl pl;
+ }}
+ content()* End_entity
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ content() content()* End_entity
+ {{ () }}
+
+| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
+ End_entity
+ {{ () }}
+
+
+start_tag():
+ /* parses <element attribute-values> or <element attribute-values/>.
+ *
+ * EFFECT: If elstack is non-empty, the element is added to the
+ * top element of the stack as new child, and the element
+ * is pushed on the stack. If elstack is empty, the root_exemplar is
+ * modified and gets the parsed name and attribute list. The root_exemplar
+ * is pushed on the stack. If additionally the variable root is empty, too,
+ * this variable is initialized.
+ * If the <element ... /> form has been parsed, no element is pushed
+ * on the stack.
+ */
+ tag: Tag_beg
+ $ {{ let position =
+ if config.store_element_positions then
+ Some(context.manager # position)
+ else
+ None
+ in
+ }}
+ ws: Ignore? Ignore*
+ attlist: attribute()*
+ emptiness: start_tag_rangle()
+ /* Note: it is guaranteed that there is whitespace between Tag_beg and
+ * the name of the first attribute, because there must be some separator.
+ * So we need not to check ws!
+ */
+ {{
+ let rec check_attlist al =
+ match al with
+ (nv1, num1) :: al' ->
+ if not num1 && al' <> [] then begin
+ match al with
+ ((n1,_),_) :: ((n2,_),_) :: _ ->
+ raise(WF_error("Whitespace is missing between attributes `" ^
+ n1 ^ "' and `" ^ n2 ^ "'"))
+ | _ -> assert false
+ end;
+ check_attlist al'
+ | [] -> ()
+ in
+ check_attlist attlist;
+
+ let name, tag_beg_entid = tag in
+ let attlist' = List.map (fun (nv,_) -> nv) attlist in
+ let d =
+ create_element_node ?position:position spec dtd name attlist' in
+
+ begin match id_index with
+ None -> ()
+ | Some idx ->
+ (* Put the ID attribute into the index, if present *)
+ begin try
+ let v = d # id_attribute_value in (* may raise Not_found *)
+ idx # add v d (* may raise ID_not_unique *)
+ with
+ Not_found ->
+ (* No ID attribute *)
+ ()
+ | ID_not_unique ->
+ (* There is already an ID with the same value *)
+ raise(Validation_error("ID not unique"))
+ end
+ end;
+
+ if n_tags_open = 0 then begin
+ if root = None then begin
+ (* We have found the begin tag of the root element. *)
+ if config.enable_super_root_node then begin
+ (* The user wants the super root instead of the real root.
+ * The real root element becomes the child of the VR.
+ *)
+ (* Assertion: self # current is the super root *)
+ assert (self # current # node_type = T_super_root);
+ root <- Some (self # current);
+ self # current # add_node d;
+ doc # init_root (self # current);
+ end
+ else begin
+ (* Normal behaviour: The user wants to get the real root. *)
+ root <- Some d;
+ doc # init_root d;
+ end;
+ end
+ else
+ (* We have found a second topmost element. This is illegal. *)
+ raise(WF_error("Document must consist of only one toplevel element"))
+ end
+ else begin
+ (* We have found some inner begin tag. *)
+ self # save_data; (* Save outstanding data material first *)
+ self # current # add_node d
+ end;
+
+ if emptiness then
+ (* An empty tag like <a/>. *)
+ d # local_validate ~use_dfa:config.validate_by_dfa ()
+ else begin
+ (* A non-empty tag. *)
+ Stack.push (d, tag_beg_entid) elstack;
+ n_tags_open <- n_tags_open + 1;
+ end;
+ }}
+ ? {{ match !yy_position with
+ "attlist" -> raise(WF_error("Bad attribute list"))
+ | "emptiness" -> raise(WF_error("`>' or `/>' expected"))
+ | _ -> raise(WF_error("Bad start tag"))
+ }}
+
+
+attribute():
+ /* Parses name="value" */
+ n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore*
+ {{ (n,v), ws }}
+
+
+attval():
+ v:Attval
+ {{ expand_attvalue lexerset dtd v config.warner true }}
+| v:Attval_nl_normalized
+ {{ expand_attvalue lexerset dtd v config.warner false }}
+
+
+start_tag_rangle():
+ Rangle {{ false }}
+| Rangle_empty {{ true }}
+
+
+end_tag():
+ /* parses </element>.
+ * Pops the top element from the elstack and checks if it is the same
+ * element.
+ */
+ tag:Tag_end Ignore* Rangle
+ {{ let name, tag_end_entid = tag in
+ if n_tags_open = 0 then
+ raise(WF_error("End-tag without start-tag"));
+
+ self # save_data; (* Save outstanding data material first *)
+
+ let x, tag_beg_entid = Stack.pop elstack in
+ let x_name =
+ match x # node_type with
+ | T_element n -> n
+ | _ -> assert false
+ in
+ if name <> x_name then
+ raise(WF_error("End-tag does not match start-tag"));
+ if tag_beg_entid != tag_end_entid then
+ raise(WF_error("End-tag not in the same entity as the start-tag"));
+ x # local_validate ~use_dfa:config.validate_by_dfa ();
+
+ n_tags_open <- n_tags_open - 1;
+
+ assert (n_tags_open >= 0);
+
+ }}
+
+char_data():
+ /* Parses any literal characters not otherwise matching, and adds the
+ * characters to the top element of elstack.
+ * If elstack is empty, it is assumed that there is no surrounding
+ * element, and any non-white space character is forbidden.
+ */
+ data:CharData
+ {{
+ if n_tags_open = 0 then
+ (* only white space is allowed *)
+ self # only_whitespace data
+ else
+ self # collect_data data
+ (* We collect the chardata material until the next end tag is
+ * reached. Then the collected material will concatenated and
+ * stored as a single T_data node (see end_tag rule above)
+ * using save_data.
+ *)
+ }}
+| data:Cdata
+ {{
+ if n_tags_open = 0 then
+ raise (WF_error("CDATA section not allowed here"));
+ self # collect_data data
+ (* Also collect CDATA material *)
+ }}
+
+cref():
+ /* Parses &#...; and adds the character to the top element of elstack. */
+ code:CRef
+ {{
+ if n_tags_open = 0 then
+ (* No surrounding element: character references are not allowed *)
+ raise(WF_error("Character reference not allowed here"));
+ self # collect_data (character config.encoding config.warner code)
+ (* Also collect character references *)
+ }}
+
+pi():
+ /* Parses <?...?> (but not <?xml white-space ... ?>).
+ * If there is a top element in elstack, the processing instruction is added
+ * to this element.
+ */
+ pi: PI
+ {{
+ let position =
+ if config.store_element_positions then
+ Some(context.manager # position)
+ else
+ None
+ in
+ let target,value = pi in
+
+ if n_tags_open = 0 & not config.enable_super_root_node
+ then
+ doc # add_pinstr (new proc_instruction target value config.encoding)
+ else begin
+ (* Special case: if processing instructions are processed inline,
+ * they are wrapped into T_pinstr nodes.
+ *)
+ if config.enable_pinstr_nodes then begin
+ self # save_data; (* Save outstanding data material first *)
+ let pinstr = new proc_instruction target value config.encoding in
+ let wrapper = create_pinstr_node
+ ?position:position spec dtd pinstr in
+ wrapper # local_validate(); (* succeeds always *)
+ self # current # add_node wrapper;
+ end
+ else
+ (* Normal behaviour: Add the PI to the parent element. *)
+ self # current # add_pinstr
+ (new proc_instruction target value config.encoding)
+ end
+ }}
+
+
+comment():
+ /* Parses <!-- ... -->
+ */
+ Comment_begin
+ $ {{
+ let position =
+ if config.enable_comment_nodes && config.store_element_positions then
+ Some(context.manager # position)
+ else
+ None
+ in
+ }}
+ mat: Comment_material*
+ ce: Comment_end
+ {{
+ if config.enable_comment_nodes then begin
+ self # save_data; (* Save outstanding data material first *)
+ let comment_text = String.concat "" mat in
+ let wrapper = create_comment_node
+ ?position:position spec dtd comment_text in
+ wrapper # local_validate(); (* succeeds always *)
+ self # current # add_node wrapper;
+ end
+ }}
+ ? {{ match !yy_position with
+ | "ce" -> raise(WF_error("`-->' expected"))
+ | _ -> raise(WF_error("Bad comment"))
+ }}
+
+
+%%
+ (* The method "parse" continues here... *)
+
+ try
+ match start_symbol with
+ Ext_document ->
+ parse_ext_document context.current context.get_next
+ | Ext_declarations ->
+ parse_ext_declarations context.current context.get_next
+ | Ext_element ->
+ parse_ext_element context.current context.get_next
+ with
+ Not_found ->
+ raise Parsing.Parse_error
+
+ (*********** The method "parse" ends here *************)
+
+
+(**********************************************************************)
+
+(* Here ends the class definition: *)
+end
+;;
+
+(**********************************************************************)
+
+open Pxp_reader;;
+
+
+class default_ext =
+ object(self)
+ val mutable node = (None : ('a extension node as 'a) option)
+ method clone = {< >}
+ method node =
+ match node with
+ None ->
+ assert false
+ | Some n -> n
+ method set_node n =
+ node <- Some n
+ end
+;;
+
+
+let default_extension = new default_ext;;
+
+let default_spec =
+ make_spec_from_mapping
+ ~super_root_exemplar: (new element_impl default_extension)
+ ~comment_exemplar: (new element_impl default_extension)
+ ~default_pinstr_exemplar: (new element_impl default_extension)
+ ~data_exemplar: (new data_impl default_extension)
+ ~default_element_exemplar: (new element_impl default_extension)
+ ~element_mapping: (Hashtbl.create 1)
+ ()
+;;
+
+
+let idref_pass id_index root =
+ let error t att value =
+ let name =
+ match t # node_type with
+ T_element name -> name
+ | _ -> assert false
+ in
+ let text =
+ "Attribute `" ^ att ^ "' of element `" ^ name ^
+ "' refers to unknown ID `" ^ value ^ "'" in
+ let pos_ent, pos_line, pos_col = t # position in
+ if pos_line = 0 then
+ raise(Validation_error text)
+ else
+ raise(At("In entity " ^ pos_ent ^ " at line " ^
+ string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^
+ ":\n",
+ Validation_error text))
+ in
+
+ let rec check_tree t =
+ let idref_atts = t # idref_attribute_names in
+ List.iter
+ (fun att ->
+ match t # attribute att with
+ Value s ->
+ begin try ignore(id_index # find s) with
+ Not_found ->
+ error t att s
+ end
+ | Valuelist l ->
+ List.iter
+ (fun s ->
+ try ignore(id_index # find s) with
+ Not_found ->
+ error t att s
+ )
+ l
+ | Implied_value -> ()
+ )
+ idref_atts;
+ List.iter check_tree (t # sub_nodes)
+ in
+ check_tree root
+;;
+
+
+exception Return_DTD of dtd;;
+ (* Used by extract_dtd_from_document_entity to jump out of the parser *)
+
+
+let call_parser ~configuration:cfg
+ ~source:src
+ ~dtd
+ ~extensible_dtd
+ ~document:doc
+ ~specification:spec
+ ~process_xmldecl
+ ~transform_dtd
+ ~(id_index : 'ext #index option)
+ ~use_document_entity
+ ~entry
+ ~init_lexer =
+ let e = cfg.errors_with_line_numbers in
+ let w = cfg.warner in
+ let r, en =
+ match src with
+ Entity(m,r') -> r', m dtd
+ | ExtID(xid,r') -> r',
+ if use_document_entity then
+ new document_entity
+ r' dtd "[toplevel]" w xid e
+ cfg.encoding
+ else
+ new external_entity
+ r' dtd "[toplevel]" w xid false e
+ cfg.encoding
+ in
+ r # init_rep_encoding cfg.encoding;
+ r # init_warner w;
+ en # set_debugging_mode (cfg.debugging_mode);
+ let pobj =
+ new parser_object
+ doc
+ dtd
+ extensible_dtd
+ cfg
+ r
+ spec
+ process_xmldecl
+ transform_dtd
+ (id_index :> 'ext index option)
+ in
+ let mgr = new entity_manager en in
+ en # open_entity true init_lexer;
+ begin try
+ let context = make_context mgr in
+ pobj # parse context entry;
+ ignore(en # close_entity);
+ with
+ Return_DTD d ->
+ ignore(en # close_entity);
+ raise(Return_DTD d)
+ | error ->
+ ignore(en # close_entity);
+ r # close_all;
+ let pos = mgr # position_string in
+ raise (At(pos, error))
+ end;
+ if cfg.idref_pass then begin
+ match id_index with
+ None -> ()
+ | Some idx ->
+ ( match pobj # root with
+ None -> ()
+ | Some root ->
+ idref_pass idx root;
+ )
+ end;
+ pobj
+
+
+let parse_dtd_entity cfg src =
+ (* Parse a DTD given as separate entity. *)
+ let dtd = new dtd cfg.warner cfg.encoding in
+ let doc = new document cfg.warner in
+ let pobj =
+ call_parser
+ ~configuration:cfg
+ ~source:src
+ ~dtd:dtd
+ ~extensible_dtd:true (* Extend the DTD by parsed declarations *)
+ ~document:doc
+ ~specification:default_spec
+ ~process_xmldecl:false (* The XML declaration is ignored
+ * (except 'encoding')
+ *)
+ ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
+ ~id_index: None
+ ~use_document_entity:false
+ ~entry:Ext_declarations (* Entry point of the grammar *)
+ ~init_lexer:Declaration (* The initially used lexer *)
+ in
+ dtd # validate;
+ if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
+ dtd
+;;
+
+
+let parse_content_entity ?id_index cfg src dtd spec =
+ (* Parse an element given as separate entity *)
+ dtd # validate; (* ensure that the DTD is valid *)
+ if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
+ let doc = new document cfg.warner in
+ let pobj =
+ call_parser
+ ~configuration:cfg
+ ~source:src
+ ~dtd:dtd
+ ~extensible_dtd:true (* Extend the DTD by parsed declarations *)
+ ~document:doc
+ ~specification:spec
+ ~process_xmldecl:false (* The XML declaration is ignored
+ * (except 'encoding')
+ *)
+ ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
+ ~id_index:(id_index :> 'ext index option)
+ ~use_document_entity:false
+ ~entry:Ext_element (* Entry point of the grammar *)
+ ~init_lexer:Content (* The initially used lexer *)
+ in
+ match pobj # root with
+ Some r -> r
+ | None -> raise(WF_error("No root element"))
+;;
+
+
+let parse_wfcontent_entity cfg src spec =
+ let dtd = new dtd cfg.warner cfg.encoding in
+ dtd # allow_arbitrary;
+ let doc = new document cfg.warner in
+ let pobj =
+ call_parser
+ ~configuration:cfg
+ ~source:src
+ ~dtd:dtd
+ ~extensible_dtd:false (* Do not extend the DTD *)
+ ~document:doc
+ ~specification:spec
+ ~process_xmldecl:false (* The XML declaration is ignored
+ * (except 'encoding')
+ *)
+ ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
+ ~id_index:None
+ ~use_document_entity:false
+ ~entry:Ext_element (* Entry point of the grammar *)
+ ~init_lexer:Content (* The initially used lexer *)
+ in
+ match pobj # root with
+ Some r -> r
+ | None -> raise(WF_error("No root element"))
+;;
+
+
+let iparse_document_entity ?(transform_dtd = (fun x -> x))
+ ?id_index
+ cfg0 src spec p_wf =
+ (* Parse an element given as separate entity *)
+ (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *)
+ let cfg = { cfg0 with
+ recognize_standalone_declaration =
+ cfg0.recognize_standalone_declaration && (not p_wf)
+ } in
+ let dtd = new dtd cfg.warner cfg.encoding in
+ if p_wf then
+ dtd # allow_arbitrary;
+ let doc = new document cfg.warner in
+ let pobj =
+ call_parser
+ ~configuration:cfg
+ ~source:src
+ ~dtd:dtd
+ ~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations
+ * only if in validating mode
+ *)
+ ~document:doc
+ ~specification:spec
+ ~process_xmldecl:true (* The XML declaration is processed *)
+ (* TODO: change to 'not p_wf' ? *)
+ ~transform_dtd:(fun dtd ->
+ let dtd' = transform_dtd dtd in
+ if cfg.accept_only_deterministic_models then
+ dtd' # only_deterministic_models;
+ dtd')
+
+ ~id_index:(id_index :> 'ext index option)
+ ~use_document_entity:true
+ ~entry:Ext_document (* Entry point of the grammar *)
+ ~init_lexer:Document (* The initially used lexer *)
+ in
+ pobj # doc
+;;
+
+
+let parse_document_entity ?(transform_dtd = (fun x -> x))
+ ?id_index
+ cfg src spec =
+ iparse_document_entity
+ ~transform_dtd:transform_dtd
+ ?id_index:(id_index : 'ext #index option :> 'ext index option)
+ cfg src spec false;;
+
+let parse_wfdocument_entity cfg src spec =
+ iparse_document_entity cfg src spec true;;
+
+let extract_dtd_from_document_entity cfg src =
+ let transform_dtd dtd = raise (Return_DTD dtd) in
+ try
+ let doc = parse_document_entity
+ ~transform_dtd:transform_dtd
+ cfg
+ src
+ default_spec in
+ (* Should not happen: *)
+ doc # dtd
+ with
+ Return_DTD dtd ->
+ (* The normal case: *)
+ dtd
+;;
+
+
+let default_config =
+ let w = new drop_warnings in
+ { warner = w;
+ errors_with_line_numbers = true;
+ enable_pinstr_nodes = false;
+ enable_super_root_node = false;
+ enable_comment_nodes = false;
+ encoding = `Enc_iso88591;
+ recognize_standalone_declaration = true;
+ store_element_positions = true;
+ idref_pass = false;
+ validate_by_dfa = true;
+ accept_only_deterministic_models = true;
+ debugging_mode = false;
+ }
+
+
+class [ 'ext ] hash_index =
+object
+ constraint 'ext = 'ext node #extension
+ val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t)
+ method add s n =
+ try
+ ignore(Hashtbl.find ht s);
+ raise ID_not_unique
+ with
+ Not_found ->
+ Hashtbl.add ht s n
+
+ method find s = Hashtbl.find ht s
+ method index = ht
+end
+
+
+(* ======================================================================
+ * History:
+ *
+ * $Log$
+ * Revision 1.1 2000/11/17 09:57:29 lpadovan
+ * Initial revision
+ *
+ * Revision 1.14 2000/08/26 23:23:14 gerd
+ * Bug: from_file must not interpret the file name as URL path.
+ * Bug: When PI and comment nodes are generated, the collected data
+ * material must be saved first.
+ *
+ * Revision 1.13 2000/08/19 21:30:03 gerd
+ * Improved the error messages of the parser
+ *
+ * Revision 1.12 2000/08/18 20:16:25 gerd
+ * Implemented that Super root nodes, pinstr nodes and comment
+ * nodes are included into the document tree.
+ *
+ * Revision 1.11 2000/08/14 22:24:55 gerd
+ * Moved the module Pxp_encoding to the netstring package under
+ * the new name Netconversion.
+ *
+ * Revision 1.10 2000/07/23 02:16:33 gerd
+ * Support for DFAs.
+ *
+ * Revision 1.9 2000/07/14 13:57:29 gerd
+ * Added the id_index feature.
+ *
+ * Revision 1.8 2000/07/09 17:52:45 gerd
+ * New implementation for current_data.
+ * The position of elements is stored on demand.
+ *
+ * Revision 1.7 2000/07/09 01:00:35 gerd
+ * Improvement: It is now guaranteed that only one data node
+ * is added for consecutive character material.
+ *
+ * Revision 1.6 2000/07/08 16:27:29 gerd
+ * Cleaned up the functions calling the parser.
+ * New parser argument: transform_dtd.
+ * Implementations for 'extract_dtd_from_document_entity' and
+ * 'parse_wfcontent_entity'.
+ *
+ * Revision 1.5 2000/07/06 23:05:18 gerd
+ * Initializations of resolvers were missing.
+ *
+ * Revision 1.4 2000/07/06 22:11:01 gerd
+ * Fix: The creation of the non-virtual root element is protected
+ * in the same way as the virtual root element.
+ *
+ * Revision 1.3 2000/07/04 22:15:18 gerd
+ * Change: Using the new resolver capabilities.
+ * Still incomplete: the new extraction and parsing functions.
+ *
+ * Revision 1.2 2000/06/14 22:19:06 gerd
+ * Added checks such that it is impossible to mix encodings.
+ *
+ * Revision 1.1 2000/05/29 23:48:38 gerd
+ * Changed module names:
+ * Markup_aux into Pxp_aux
+ * Markup_codewriter into Pxp_codewriter
+ * Markup_document into Pxp_document
+ * Markup_dtd into Pxp_dtd
+ * Markup_entity into Pxp_entity
+ * Markup_lexer_types into Pxp_lexer_types
+ * Markup_reader into Pxp_reader
+ * Markup_types into Pxp_types
+ * Markup_yacc into Pxp_yacc
+ * See directory "compatibility" for (almost) compatible wrappers emulating
+ * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
+ *
+ * ======================================================================
+ * Old logs from markup_yacc.m2y:
+ *
+ * Revision 1.9 2000/05/29 21:14:57 gerd
+ * Changed the type 'encoding' into a polymorphic variant.
+ *
+ * Revision 1.8 2000/05/27 19:26:19 gerd
+ * Change: The XML declaration is interpreted right after
+ * it has been parsed (no longer after the document): new function
+ * check_and_parse_xmldecl.
+ * When elements, attributes, and entities are declared
+ * it is stored whether the declaration happens in an external
+ * entity (for the standalone check).
+ * The option recognize_standalone_declaration is interpreted.
+ *
+ * Revision 1.7 2000/05/20 20:31:40 gerd
+ * Big change: Added support for various encodings of the
+ * internal representation.
+ *
+ * Revision 1.6 2000/05/14 21:51:24 gerd
+ * Change: Whitespace is handled by the grammar, and no longer
+ * by the entity.
+ *
+ * Revision 1.5 2000/05/14 17:50:54 gerd
+ * Updates because of changes in the token type.
+ *
+ * Revision 1.4 2000/05/11 22:09:17 gerd
+ * Fixed the remaining problems with conditional sections.
+ * This seems to be also a weakness of the XML spec!
+ *
+ * Revision 1.3 2000/05/09 00:02:44 gerd
+ * Conditional sections are now recognized by the parser.
+ * There seem some open questions; see the TODO comments!
+ *
+ * Revision 1.2 2000/05/08 22:01:44 gerd
+ * Introduced entity managers (see markup_entity.ml).
+ * The XML declaration is now recognized by the parser. If such
+ * a declaration is found, the method process_xmldecl of the currently
+ * active entity is called. If the first token is not an XML declaration,
+ * the method process_missing_xmldecl is called instead.
+ * Some minor changes.
+ *
+ * Revision 1.1 2000/05/06 23:21:49 gerd
+ * Initial revision.
+ *
+ *
+ * ======================================================================
+ *
+ * COPIED FROM REVISION 1.19 OF markup_yacc.mly
+ *
+ * Revision 1.19 2000/05/01 15:20:08 gerd
+ * "End tag matches start tag" is checked before "End tag in the
+ * same entity as start tag".
+ *
+ * Revision 1.18 2000/04/30 18:23:08 gerd
+ * Bigger change: Introduced the concept of virtual roots. First,
+ * this reduces the number of checks. Second, it makes it possible to
+ * return the virtual root to the caller instead of the real root (new
+ * config options 'virtual_root' and 'processing_instructions_inline').
+ * Minor changes because of better CR/CRLF handling.
+ *
+ * Revision 1.17 2000/03/13 23:47:46 gerd
+ * Updated because of interface changes. (See markup_yacc_shadow.mli
+ * rev. 1.8)
+ *
+ * Revision 1.16 2000/01/20 20:54:43 gerd
+ * New config.errors_with_line_numbers.
+ *
+ * Revision 1.15 1999/12/17 22:27:58 gerd
+ * Bugfix: The value of 'p_internal_subset' (an instance
+ * variable of the parser object) is to true when the internal subset
+ * begins, and is set to false when this subset ends. The error was
+ * that references to external entities within this subset did not
+ * set 'p_internal_subset' to false; this is now corrected by introducing
+ * the 'p_internal_subset_stack'.
+ * This is a typical example of how the code gets more and
+ * more complicated and that it is very difficult to really understand
+ * what is going on.
+ *
+ * Revision 1.14 1999/11/09 22:23:37 gerd
+ * Removed the invocation of "init_dtd" of the root document.
+ * This method is no longer available. The DTD is also passed to the
+ * document object by the root element, so nothing essential changes.
+ *
+ * Revision 1.13 1999/10/25 23:37:09 gerd
+ * Bugfix: The warning "More than one ATTLIST declaration for element
+ * type ..." is only generated if an ATTLIST is found while there are already
+ * attributes for the element.
+ *
+ * Revision 1.12 1999/09/01 23:08:38 gerd
+ * New frontend function: parse_wf_document. This simply uses
+ * a DTD that allows anything, and by the new parameter "extend_dtd" it is
+ * avoided that element, attlist, and notation declarations are added to this
+ * DTD. The idea is that this function simulates a well-formedness parser.
+ * Tag_beg, Tag_end carry the entity_id. The "elstack" stores the
+ * entity_id of the stacked tag. This was necessary because otherwise there
+ * are some examples to produces incorrectly nested elements.
+ * p_internal_subset is a variable that stores whether the internal
+ * subset is being parsed. This is important beacause entity declarations in
+ * internal subsets are not allowed to contain parameter references.
+ * It is checked if the "elstack" is empty after all has been parsed.
+ * Processing instructions outside DTDs and outside elements are now
+ * added to the document.
+ * The rules of mixed and regexp style content models have been
+ * separated. The code is now much simpler.
+ * Entity references outside elements are detected and rejected.
+ *
+ * Revision 1.11 1999/09/01 16:26:08 gerd
+ * Improved the quality of error messages.
+ *
+ * Revision 1.10 1999/08/31 19:13:31 gerd
+ * Added checks on proper PE nesting. The idea is that tokens such
+ * as Decl_element and Decl_rangle carry an entity ID with them. This ID
+ * is simply an object of type < >, i.e. you can only test on identity.
+ * The lexer always produces tokens with a dummy ID because it does not
+ * know which entity is the current one. The entity layer replaces the dummy
+ * ID with the actual ID. The parser checks that the IDs of pairs such as
+ * Decl_element and Decl_rangle are the same; otherwise a Validation_error
+ * is produced.
+ *
+ * Revision 1.9 1999/08/15 20:42:01 gerd
+ * Corrected a misleading message.
+ *
+ * Revision 1.8 1999/08/15 20:37:34 gerd
+ * Improved error messages.
+ * Bugfix: While parsing document entities, the subclass document_entity is
+ * now used instead of external_entity. The rules in document entities are a bit
+ * stronger.
+ *
+ * Revision 1.7 1999/08/15 14:03:59 gerd
+ * Empty documents are not allowed.
+ * "CDATA section not allowed here" is a WF_error, not a Validation_
+ * error.
+ *
+ * Revision 1.6 1999/08/15 02:24:19 gerd
+ * Removed some grammar rules that were used for testing.
+ * Documents without DTD can now have arbitrary elements (formerly
+ * they were not allowed to have any element).
+ *
+ * Revision 1.5 1999/08/14 22:57:20 gerd
+ * It is allowed that external entities are empty because the
+ * empty string is well-parsed for both declarations and contents. Empty
+ * entities can be referenced anywhere because the references are replaced
+ * by nothing. Because of this, the Begin_entity...End_entity brace is only
+ * inserted if the entity is non-empty. (Otherwise references to empty
+ * entities would not be allowed anywhere.)
+ * As a consequence, the grammar has been changed such that a
+ * single Eof is equivalent to Begin_entity,End_entity without content.
+ *
+ * Revision 1.4 1999/08/14 22:20:01 gerd
+ * The "config" slot has now a component "warner" which is
+ * an object with a "warn" method. This is used to warn about characters
+ * that cannot be represented in the Latin 1 alphabet.
+ * Furthermore, there is a new component "debugging_mode".
+ * Some Parse_error exceptions have been changed into Validation_error.
+ * The interfaces of functions/classes imported from other modules
+ * have changed; the invocations have been adapted.
+ * Contents may contain CDATA sections that have been forgotten.
+ *
+ * Revision 1.3 1999/08/11 15:00:41 gerd
+ * The Begin_entity ... End_entity brace is also possible in
+ * 'contents'.
+ * The configuration passed to the parsing object contains always
+ * the resolver that is actually used.
+ *
+ * Revision 1.2 1999/08/10 21:35:12 gerd
+ * The XML/encoding declaration at the beginning of entities is
+ * evaluated. In particular, entities have now a method "xml_declaration"
+ * which returns the name/value pairs of such a declaration. The "encoding"
+ * setting is interpreted by the entity itself; "version", and "standalone"
+ * are interpreted by Markup_yacc.parse_document_entity. Other settings
+ * are ignored (this does not conform to the standard; the standard prescribes
+ * that "version" MUST be given in the declaration of document; "standalone"
+ * and "encoding" CAN be declared; no other settings are allowed).
+ * TODO: The user should be warned if the standard is not exactly
+ * fulfilled. -- The "standalone" property is not checked yet.
+ *
+ * Revision 1.1 1999/08/10 00:35:52 gerd
+ * Initial revision.
+ *
+ *
+ *)