+++ /dev/null
-(* $Id$ -*- tuareg -*-
- * ----------------------------------------------------------------------
- * PXP: The polymorphic XML parser for Objective Caml.
- * Copyright by Gerd Stolpmann. See LICENSE for details.
- *)
-
-open Parsing
-open Pxp_types
-open Pxp_lexer_types
-open Pxp_dtd
-open Pxp_entity
-open Pxp_document
-open Pxp_aux
-
-(* Some types from the interface definition: *)
-
-exception ID_not_unique
-
-class type [ 'ext ] index =
-object
- constraint 'ext = 'ext node #extension
- method add : string -> 'ext node -> unit
- method find : string -> 'ext node
-end
-
-
-type config =
- { warner : collect_warnings;
- errors_with_line_numbers : bool;
- enable_pinstr_nodes : bool;
- enable_super_root_node : bool;
- enable_comment_nodes : bool;
- encoding : rep_encoding;
- recognize_standalone_declaration : bool;
- store_element_positions : bool;
- idref_pass : bool;
- validate_by_dfa : bool;
- accept_only_deterministic_models : bool;
- debugging_mode : bool;
- }
-
-type source =
- Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
- | ExtID of (ext_id * Pxp_reader.resolver)
-
-
-type start_symbol =
- Ext_document
- | Ext_declarations
- | Ext_element
-
-
-type context =
- { mutable current : unit -> token; (* get the current token *)
- mutable get_next : unit -> token; (* go on to the next token; return it *)
- mutable current_token : token; (* This is the current token *)
- mutable manager : entity_manager; (* The entity manager *)
- }
-
-
-let make_context entity_manager =
- let c =
- { current = (fun _ -> assert false);
- get_next = (fun _ -> assert false);
- current_token = Eof;
- manager = entity_manager;
- }
- in
- (* Note that the function which is stored in get_next_ref can be changed
- * as a side-effect when an entity is opened or closed. The function in
- * c.get_next must be programmed such that always the current "get_next"
- * function is executed.
- *)
- let get_next_ref = entity_manager # yy_get_next_ref in
- c.current <- (fun () -> c.current_token);
- c.get_next <- (fun () -> let tok = !get_next_ref() in
- c.current_token <- tok;
- tok);
- ignore(c.get_next());
- c
-;;
-
-
-let from_channel ?system_encoding ?id:init_id ?fixenc ch =
-
- (* Reading from a channel works by modifying the algorithm of
- * resolve_as_file.
- *)
-
- let url_syntax = (* A syntax suitable for "file" URLs *)
- { Neturl.null_url_syntax with
- Neturl.url_enable_scheme = Neturl.Url_part_allowed;
- Neturl.url_enable_host = Neturl.Url_part_allowed;
- Neturl.url_enable_path = Neturl.Url_part_required;
- Neturl.url_accepts_8bits = true;
- }
- in
-
- let an_url =
- Neturl.make_url
- ~scheme: "file"
- ~host: ""
- ~path: [ "" ]
- url_syntax
- in
-
- let init_channel_done = ref false in
- (* Whether the first access to this source has already happened. *)
-
- (* The task of url_of_id is:
- * - When it is called the first time, and no init_id is present,
- * the URL file:/// is passed back (an_url). This forces that
- * absolute path names /path/dir/... will be interpreted as
- * file path names. (But relative path names will not work.)
- * - If an init_id has been passed, we can assume that the opened URL
- * is exactly this init_id. By raising Not_competent it is indicated
- * that the standard method is to be used for the interpretation of
- * the URL.
- * - Otherwise, the channel is already being read, and thus cannot again
- * opened. (This case is handled in channel_of_url.)
- *)
-
- let url_of_id xid =
- if !init_channel_done then begin
- (* Use the normal way of determining the URL of the ID: *)
- raise Pxp_reader.Not_competent
- end
- else begin
- match init_id with
- None ->
- an_url
- (* If the channel is not associated with any URL: Simply pass
- * the URL file:/// back.
- *)
- | Some the_init_id ->
- assert (the_init_id = xid);
- raise Pxp_reader.Not_competent
- (* If the channel is associated with a URL, the corresponding
- * ID must be passed when the first invocation happens.
- *)
- end
- in
-
- (* The task of channel_of_url:
- * - If it is called the first time ("else"), the channel is returned
- * - Otherwise, the channel is already being read, and thus cannot again
- * opened. By raising Not_competent it is signaled that the
- * resolve_as_file object must not continue to open the URL.
- *)
-
- let channel_of_url url =
- if !init_channel_done then
- raise Pxp_reader.Not_competent
- else begin
- init_channel_done := true;
- ch, fixenc
- end
- in
-
- let r =
- new Pxp_reader.resolve_as_file
- ?system_encoding:system_encoding
- ~url_of_id:url_of_id
- ~channel_of_url:channel_of_url
- ()
- in
-
- let init_xid =
- match init_id with
- None -> Anonymous
- | Some id ->
- (* Note: 'id' may be illegal (malformed); in this case, the first
- * invocation of url_of_id will raise Not_competent, and the 'open_in'
- * method will fail.
- *)
- id
- in
-
- ExtID(init_xid, r)
-;;
-
-
-let from_file ?system_encoding utf8_filename =
-
- let r =
- new Pxp_reader.resolve_as_file
- ?system_encoding:system_encoding
- ()
- in
-
- let utf8_abs_filename =
- if utf8_filename <> "" && utf8_filename.[0] = '/' then
- utf8_filename
- else
- Sys.getcwd() ^ "/" ^ utf8_filename
- in
-
- let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in
- let url = Neturl.make_url
- ~scheme:"file"
- ~host:"localhost"
- ~path:(Neturl.split_path utf8_abs_filename)
- syntax
- in
-
- let xid = System (Neturl.string_of_url url) in
-
-
- ExtID(xid, r)
-;;
-
-
-let from_string ?fixenc s =
- let r =
- new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in
- ExtID(Anonymous, r)
-;;
-
-
-(**********************************************************************)
-
-class ['ext] parser_object
- init_doc init_dtd init_extend_dtd init_config init_resolver init_spec
- init_process_xmldecl transform_dtd id_index
- =
- object (self)
-
- (* Note that the 'ext parameter has been the motivation to make the
- * parser a class.
- *)
-
- val mutable dtd = init_dtd
- (* The DTD being parsed; or the DTD currently assumed *)
-
- val extend_dtd = init_extend_dtd
- (* Whether the DTD should be extended by ELEMENT, ATTLIST, and
- * NOTATION declarations or not. (True for validating mode,
- * false for well-formedness mode.)
- *)
-
- val transform_dtd = transform_dtd
- (* A function transforming the DTD *)
-
- val id_index = (id_index : 'ext index option)
- (* The ID index or None *)
-
- val process_xmldecl = init_process_xmldecl
- (* Whether the XML declaration is parsed and the found XML version
- * and standalone declaration are passed to 'doc'.
- *)
-
- val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding)
-
- val doc = init_doc
- (* The current document *)
-
- method doc = (doc : 'ext document)
-
- val resolver = init_resolver
- (* The resolver for external IDs *)
-
- val config = init_config
- (* The current configuration *)
-
- val elstack = (Stack.create() : ('ext node * entity_id) Stack.t)
- (* The element stack containing all open elements, i.e. elements that
- * have begun by a start tag but that have not been finished (end tag).
- * If the parser sees a start tag, it creates the element and pushes it
- * on top of this stack. If the parser recognizes an end tag, it pulls
- * one element from the stack and checks if it has the same name as
- * given with the end tag.
- *
- * At initialization time, a special element is pushed on the stack,
- * the so-called super root. It is always the bottommost
- * element of the stack, and serves as a guard.
- * [See "initializer" below.]
- *)
-
- method current =
- (* Get the top element of the element stack *)
- try
- fst(Stack.top elstack)
- with
- Stack.Empty -> assert false
- (* Not possible, because the super root is always the element
- * at the bottom of the stack.
- *)
-
- val mutable n_tags_open = 0
- (* Number of begin tags that have been parsed and whose corresponding
- * end tags have not yet been parsed
- *)
-
- val mutable p_internal_subset = false
- (* true while parsing the internal subset - there are some additional
- * constraints for internal subsets, and because of this it must
- * be known whether the current declaration is contained in the
- * internal or external subset of the DTD.
- *)
-
- val mutable root = None
- (* Contains the root element (topmost element) while it is being parsed
- * and after it has been parsed.
- * This variable is None before the root element is seen.
- *)
-
- method root = root
-
- val spec = init_spec
- (* A hashtable that contains exemplar objects for the various element
- * types. If an element is parsed, the exemplar is looked up and
- * "cloned" (by the "create" method)
- *)
-
- val mutable current_data = []
- (* Collects character data. *)
-
- method collect_data s =
- (* Collects the character material 's' *)
- current_data <- s :: current_data
-
- method save_data =
- (* Puts the material collected in 'current_data' into a new
- * node, and appends this node as new sub node to 'current'
- *)
- match current_data with
- [] ->
- ()
- | [ str ] ->
- if str <> "" then
- self # current # add_node (create_data_node spec dtd str);
- current_data <- []
- | _ ->
- let count = List.fold_left
- (fun acc s -> acc + String.length s)
- 0
- current_data in
- let str = String.create count in
- let pos = ref count in
- List.iter
- (fun s ->
- let l = String.length s in
- pos := !pos - l;
- String.blit
- ~src:s
- ~src_pos:0
- ~dst:str
- ~dst_pos:(!pos)
- ~len:l
- )
- current_data;
- assert(!pos = 0);
- if str <> "" then
- self # current # add_node (create_data_node spec dtd str);
- current_data <- []
-
-
- method only_whitespace data =
- (* Checks that the string "data" contains only whitespace. On failure,
- * Validation_error is raised.
- *)
- let lexbuf = Lexing.from_string data in
- let t1 = lexerset.scan_name_string lexbuf in
- if t1 <> Ignore then
- raise(WF_error("Data not allowed here"));
- let t2 = lexerset.scan_name_string lexbuf in
- if t2 <> Eof then
- raise(WF_error("Data not allowed here"));
- ()
-
- initializer
- (* CHECKS: *)
- if config.encoding <> dtd # encoding then
- failwith("Encoding mismatch");
-
- (* --- Initialize 'elstack': Push the super-root on the stack. *)
- let super_root =
- if config.enable_super_root_node then
- create_super_root_node spec dtd
- else
- (* because spec may not contain an exemplar for the super root: *)
- create_no_node spec dtd
- in
- (* Move the super root or the emulation to the stack: *)
- Stack.push (super_root, (self :> entity_id)) elstack;
-
-
-
- (********* Here the method "parse" begins. The grammar below is
- * transformed to a local function of this method
- *)
-
- method parse context start_symbol =
-
- let parse_ignored_section yy_current yy_get_next =
- (* A special parser which should be used after <![IGNORE[.
- * It parses until the corresponding ]]> is found.
- *)
-
- while yy_current() = Ignore do
- ignore(yy_get_next());
- done;
-
- ( match yy_current() with
- Conditional_body _ -> ()
- | _ -> raise Parsing.Parse_error;
- );
-
- let en = context.manager # current_entity in
- let llev = ref 1 in
- while !llev >= 1 do
- let igntok = en # next_ignored_token in
- (* next_ignored_token: uses a special lexer that only
- * recognizes Conditional_begin and Conditional_end;
- * other character combinations are ignored.
- *)
- (* NOTE: next_ignored_token works much like yy_get_next,
- * but it does not set the current token!
- *)
- match igntok with
- Conditional_begin _ ->
- llev := !llev + 1
- | Conditional_end _ ->
- llev := !llev - 1;
- (* Because the loop may be exited now: *)
- context.current_token <- igntok;
- | (End_entity | Eof) ->
- raise Parsing.Parse_error
- | _ ->
- ()
- done;
-
- in
-
-
- let check_and_parse_xmldecl xmldecl =
- if process_xmldecl then begin
- let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in
- check_version_num v;
- doc # init_xml_version v;
- let v = match s with
- None -> false
- | Some "yes" -> true
- | Some "no" -> false
- | _ -> raise (WF_error("Illegal 'standalone' declaration"))
- in
- if config.recognize_standalone_declaration then
- dtd # set_standalone_declaration v
- end
- in
-
- let recode_utf8 s =
- (* Recode 's' to UTF-8 *)
- if config.encoding = `Enc_utf8 then
- s (* No recoding necessary *)
- else
- Netconversion.recode_string
- ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s
- in
-
-
-%%
-
-/* The following grammar looks similar to ocamlyacc grammars, but
- * ocamlyacc is actually not used to transform the grammar into a parser.
- * Instead, the parser generator m2parsergen is applied.
- *
- * The format of the grammar is different (see m2parsergen/README),
- * but I hope that you can understand most features immediately.
- *
- * The type of the parser is different: m2parsergen creates a top-down
- * parser while ocamlyacc generates a LALR-1 parser.
- *
- * The way the generated code is called is different: ocamlyacc produces
- * lots of top-level definitions whereas m2parsergen generates only
- * a local let-in-phrase. This is explained in the already mentioned
- * README file.
- */
-
-/* See Pxp_types.ml for comments to the various tokens */
-
-%token Begin_entity
-%token End_entity
-%token Comment_begin
-%token Comment_end
-%token Ignore
-%token Eq
-%token Rangle
-%token Rangle_empty
-%token <> Conditional_begin
-%token <> Conditional_body
-%token <> Conditional_end
-%token Percent
-%token Plus
-%token Star
-%token Bar
-%token Comma
-%token Qmark
-%token Pcdata
-%token Required
-%token Implied
-%token Fixed
-%token Eof
-
-%token <> Comment_material
-%token <> Doctype
-%token <> Doctype_rangle
-%token <> Dtd_begin
-%token <> Dtd_end
-%token <> Decl_element
-%token <> Decl_attlist
-%token <> Decl_entity
-%token <> Decl_notation
-%token <> Decl_rangle
-%token <> Lparen
-%token <> Rparen
-%token <> RparenPlus
-%token <> RparenStar
-%token <> RparenQmark
-
-%token <> Tag_beg
-%token <> Tag_end
-
-%token <> PI
-%token <> PI_xml
-%token <> Cdata
-%token <> CRef
-%token <> ERef
-%token <> PERef
-%token <> CharData
-%token <> LineEnd
-%token <> Name
-%token <> Nametoken
-%token <> Attval
-%token <> Attval_nl_normalized
-%token <> Unparsed_string
-
-/* START SYMBOLS:
- *
- * "ext_document": parses a complete XML document (i.e. containing a
- * <!DOCTYPE..> and an element)
- * "ext_declarations": parses an "external DTD subset", i.e. a sequence
- * of declarations
- * "ext_element": parses a single element (no <!DOCTYPE...> allowed);
- * the element needs not to be the root element of the
- * DTD
- *
- * The functions corresponding to these symbols return always () because
- * they only have side-effects.
- */
-
-/* SOME GENERAL COMMENTS:
- *
- * The parser does not get its tokens from the lexers directly. Instead of
- * this, there is an entity object between the parser and the lexers. This
- * object already handles:
- *
- * - References to general and parameter entities. The token stream is
- * modified such that tokens automatically come from the referenced entities.
- * External parameter entities and all general entities are embraced by
- * the two special tokens Begin_entity and End_entity. The parser must
- * check that these braces are correctly nested.
- */
-
-%%
-
-
-ext_document():
- Begin_entity
- doc_xmldecl_then_misc_then_prolog_then_rest() End_entity
- {{
- if n_tags_open <> 0 then
- raise(WF_error("Missing end tag"))
- }}
-
-
-/* In the following rule, we must find out whether there is an XML declaration
- * or not, and directly after that either "process_xmldecl" or
- * "process_missing_xmldecl" of the current entity must be called.
- * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out
- * in the "$" clause immediately following the first token.
- *
- * TODO: This is not enough. The first token may be a tag, and the tag
- * may already contain non-ASCII characters. (But in this case, the resolvers
- * assume UTF8, and they are right...)
- */
-
-doc_xmldecl_then_misc_then_prolog_then_rest():
- pl:PI_xml
- $ {{ context.manager # current_entity # process_xmldecl pl;
- check_and_parse_xmldecl pl;
- }}
- misc()* doc_prolog_then_rest()
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- misc() misc()* doc_prolog_then_rest()
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- doctypedecl() misc()* contents_start()
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- contents_start()
- {{ () }}
-
-
-doc_prolog_then_rest():
- doctypedecl() misc()* contents_start()
- {{ () }}
-| contents_start()
- {{ () }}
-
-
-ext_element():
- Begin_entity el_xmldecl_then_misc_then_rest() End_entity
- {{
- if n_tags_open <> 0 then
- raise(WF_error("Missing end tag"))
- }}
-
-
-/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
-
-el_xmldecl_then_misc_then_rest():
- pl:PI_xml
- $ {{ context.manager # current_entity # process_xmldecl pl; }}
- misc()* contents_start()
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- misc() misc()* contents_start()
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- contents_start()
- {{ () }}
-
-
-ext_declarations():
- /* Parses a sequence of declarations given by an entity. As side-effect,
- * the parsed declarations are put into the dtd object.
- */
- Begin_entity decl_xmldecl_then_rest()
- {{ () }}
-| Eof
- {{ () }}
-
-
-decl_xmldecl_then_rest():
- /* Note: This rule is also called from declaration()! */
- pl:PI_xml
- $ {{ context.manager # current_entity # process_xmldecl pl;
- }}
- declaration()* End_entity
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- declaration() declaration()* End_entity
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- End_entity
- {{ () }}
-
-
-misc():
- pi()
- {{ () }}
-| data: CharData
- /* In this context, the lexers sometimes do not recognize white space;
- * instead CharData tokens containing white space are delivered.
- */
- {{ self # only_whitespace data }}
-| Ignore
- {{ () }}
-| comment()
- {{ () }}
-
-
-/********************* DOCUMENT TYPE DECLARATION *************************/
-
-doctypedecl():
- /* parses from <!DOCTYPE to >. As side-effect, first the declarations of
- * the internal DTD (if any) are put into !!on_dtd, then the declarations
- * of the external DTD (if any) are put into this DTD object.
- */
- doctype_entid: Doctype
- ws: Ignore Ignore*
- doctypedecl_material (doctype_entid)
- {{ () }}
- ? {{ match !yy_position with
- "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'"))
- | _ -> raise(WF_error("Bad DOCTYPE declaration"))
- }}
-
-
-/* TRICK:
- * ws: Ignore? Ignore*
- * is meant seriously. The effect is that ws becomes a boolean variable
- * which is true if there is an Ignore token and false otherwise.
- * This construct is faster than just
- * ws: Ignore*
- * in which case ws becomes an integer variable containing the number of
- * Ignore tokens. Counting the number of tokens is slower than only checking
- * the existence.
- *
- * We need the information whether there is an Ignore token (representing
- * white space), because white space is only obligatory if also an identifier
- * for the external subset is parsed; this conditional syntax constraint is
- * simply programmed in the body of the grammar rule.
- */
-
-doctypedecl_material(doctype_entid):
- root_name: Name
- ws: Ignore? Ignore*
- external_subset: external_id()?
- Ignore*
- internal_subset: internal_dtd()?
- Ignore*
- doctype_rangle_entid: Doctype_rangle
- {{
- if doctype_entid != doctype_rangle_entid then
- raise (Validation_error("Entities not properly nested with DOCTYPE declaration"));
- dtd # set_root root_name;
- begin match external_subset, internal_subset with
- None, None -> () (* no DTD means no ID *)
- | None, Some _ -> dtd # set_id Internal
- | Some id, None -> dtd # set_id (External id)
- | Some id, Some _ -> dtd # set_id (Derived id)
- end;
- (* Get now the external doctype declaration. Note that the internal
- * subset has precedence and must be read first.
- *)
- begin match external_subset with
- None -> ()
- | Some id ->
- if not ws then
- raise(WF_error("Whitespace is missing after `DOCTYPE " ^
- root_name ^ "'"));
- let r' = resolver # clone in
- let pobj =
- new parser_object
- (new document config.warner)
- dtd
- extend_dtd
- config
- r'
- spec
- process_xmldecl
- (fun x -> x)
- None
- in
- let en = new external_entity r' dtd "[dtd]"
- config.warner id false config.errors_with_line_numbers
- config.encoding
- in
- en # set_debugging_mode (config.debugging_mode);
- let mgr = new entity_manager en in
- en # open_entity true Declaration;
- try
- let context = make_context mgr in
- pobj # parse context Ext_declarations;
- ignore(en # close_entity);
- with
- error ->
- ignore(en # close_entity);
- r' # close_all;
- let pos = mgr # position_string in
- raise (At(pos, error))
- end;
- dtd # validate
- }}
- ? {{
- match !yy_position with
- "doctype_rangle_entid" -> raise(WF_error("`>' expected"))
- | _ -> raise(WF_error("Bad DOCTYPE declaration"))
- }}
-
-/* Note that there are no keywords for SYSTEM or PUBLIC, as these would
- * be difficult to recognize in the lexical contexts. Because of this,
- * SYSTEM/PUBLIC is parsed as name, and the rule for everything after
- * SYSTEM/PUBLIC is computed dynamically.
- */
-
-external_id():
- tok:Name
- $ {{
- let followup =
- match tok with
- "SYSTEM" -> parse_system_id
- (* Apply the rule system_id (below) to parse the
- * rest of the ID
- *)
- | "PUBLIC" -> parse_public_id
- (* Apply the rule public_id (below) to parse the
- * rest of the ID
- *)
- | _ -> raise(WF_error("SYSTEM or PUBLIC expected"))
- in
- }}
- ws:Ignore Ignore*
- r:[followup]()
- {{ r }}
- ? {{ match !yy_position with
- "ws" -> raise(WF_error("Whitespace is missing after " ^ tok))
- | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier"))
- }}
-
-
-system_id():
- str:Unparsed_string
- {{ System (recode_utf8 str) }}
-
-
-public_id():
- str1: Unparsed_string
- ws: Ignore Ignore*
- str2: Unparsed_string
- {{ check_public_id str1;
- Public(recode_utf8 str1, recode_utf8 str2)
- }}
- ? {{ match !yy_position with
- "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier"))
- | _ -> raise(WF_error("Bad PUBLIC identifier"))
- }}
-
-
-/* The internal subset: "[" declaration* "]". While parsing the declarations
- * the object variable p_internal_subset must be true; however, if there
- * are entity references, this variable must be reset to false during
- * the entity. (See the rule for "declaration" below.)
- */
-
-internal_dtd():
- dtd_begin_entid: internal_dtd_begin()
- declaration()*
- dtd_end_entid: internal_dtd_end()
- {{
- if dtd_begin_entid != dtd_end_entid then
- raise(Validation_error("Entities not properly nested with internal DTD subset"))
- }}
- ? {{ match !yy_position with
- "dtd_end_entid" -> raise(WF_error("`]' expected"))
- | _ -> raise(WF_error("Bad internal DTD subset"))
- }}
-
-
-internal_dtd_begin():
- Dtd_begin
- {{ assert (not p_internal_subset);
- p_internal_subset <- true }}
-
-
-internal_dtd_end():
- Dtd_end
- {{ assert p_internal_subset;
- p_internal_subset <- false }}
-
-
-declaration():
- /* Parses a single declaration (or processing instruction). As side-effect
- * the parsed declaration is stored into the dtd object.
- */
- elementdecl()
- {{ () }}
-| attlistdecl()
- {{ () }}
-| entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid)
- {{ () }}
- ? {{ match !yy_position with
- "ws" -> raise(WF_error("Whitespace is missing after ENTITY"))
- | "e" -> raise(WF_error("Name or `%' expected"))
- | _ -> raise(WF_error("Bad entity declaration"))
- }}
-| notationdecl()
- {{ () }}
-| pi: PI
- {{ let target, value = pi in
- let pi = new proc_instruction target value config.encoding in
- dtd # add_pinstr pi
- }}
-| Ignore
- {{ () }}
-| Comment_begin Comment_material* ce:Comment_end
- {{ () }}
- ? {{ match !yy_position with
- "ce" -> raise(WF_error("`-->' expected"))
- | _ -> raise(WF_error("Bad comment"))
- }}
-| Begin_entity
- $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity'
- * rule is parsed. This allows unrestricted usage of parameter entities
- * within declarations of internal entities.
- *)
- let old_p_internal_subset = p_internal_subset in
- p_internal_subset <- false;
- }}
- decl_xmldecl_then_rest()
- {{ (* Restore the old value of 'p_internal_subset'. *)
- p_internal_subset <- old_p_internal_subset;
- ()
- }}
-| begin_entid:Conditional_begin
- $ {{ (* Check whether conditional sections are allowed at this position. *)
- if p_internal_subset then
- raise(WF_error("Restriction of the internal subset: Conditional sections not allowed"));
- }}
- Ignore*
- cond:conditional_section() end_entid:Conditional_end
- {{ (* Check whether Conditional_begin and Conditional_end are in the same
- * entity. (This restriction is explained in the file SPECS.)
- *)
- if begin_entid != end_entid then
- raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)"));
- }}
- ? {{ match !yy_position with
- "end_entid" -> raise(WF_error("`>]>' expected"))
- | "cond" -> raise(WF_error("INCLUDE or IGNORE expected"))
- | _ -> raise(WF_error("Bad conditional section"))
- }}
-
-/* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the
- * right parsing rule is dynamic.
- * Note that parse_ignored_section is not defined by a grammar rule but
- * by a conventional let-binding above.
- */
-
-conditional_section():
- include_or_ignore:Name
- $ {{ let parsing_function =
- match include_or_ignore with
- "INCLUDE" -> parse_included_section
- (* invoke rule "included_section" below *)
- | "IGNORE" -> parse_ignored_section
- (* invoke function "parse_ignored_section" *)
- | _ -> raise(WF_error("INCLUDE or IGNORE expected"))
- in
- }}
- [ parsing_function ] ()
- {{ () }}
- ? {{ raise(WF_error("Bad conditional section")) }}
-
-included_section():
- Conditional_body declaration()*
- {{ () }}
-| Ignore Ignore* Conditional_body declaration()*
- {{ () }}
-
-
-/*************************** ELEMENT DECLARATIONS ********************/
-
-elementdecl():
- /* parses <!ELEMENT ... >. Puts the parsed element type as side-effect into
- * dtd.
- */
- decl_element_entid: Decl_element
- $ {{ let extdecl = context.manager # current_entity_counts_as_external in
- }}
- ws1: Ignore Ignore*
- name: Name
- ws2: Ignore Ignore*
- content_model: contentspec()
- Ignore*
- decl_rangle_entid: Decl_rangle
- {{
- if decl_element_entid != decl_rangle_entid then
- raise (Validation_error "Entities not properly nested with ELEMENT declaration");
- if extend_dtd then begin
- let el = new dtd_element dtd name in
- (* It is allowed that an <!ATTLIST...> precedes the corresponding
- * <!ELEMENT...>. Because of this it is possible that there is already
- * an element called 'name' in the DTD, and we only must set the content
- * model of this element.
- *)
- try
- dtd # add_element el;
- el # set_cm_and_extdecl content_model extdecl;
- with
- Not_found -> (* means: there is already an element 'name' *)
- let el' = dtd # element name in
- el' # set_cm_and_extdecl content_model extdecl;
- (* raises Validation_error if el' already has a content model *)
- end
- }}
- ? {{ match !yy_position with
- ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
- | "name" -> raise(WF_error("The name of the element is expected here"))
- | "content_model" -> raise(WF_error("Content model expression expected"))
- | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
- | _ -> raise(WF_error("Bad element type declaration"))
- }}
-
-contentspec():
- /* parses a content model and returns it (type content_model_type) */
- name: Name /* EMPTY or ANY */
- {{ match name with
- "EMPTY" -> Empty
- | "ANY" -> Any
- | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected"))
- }}
-| entid:Lparen Ignore* term:mixed_or_regexp(entid)
- {{ term }}
- ? {{ raise(WF_error("Bad content model expression")) }}
-
-
-/* Many of the following rules have an lparen_entid argument. This is the
- * internal ID of the entity containing the corresponding left parenthesis;
- * by comparing it with the ID of the entity of the right parenthesis the
- * contraint is implemented that both parentheses must be in the same entity.
- */
-
-mixed_or_regexp(lparen_entid):
- re: choice_or_seq(lparen_entid)
- {{ Regexp re }}
-| m: mixed(lparen_entid)
- {{ m }}
-
-
-multiplier():
- /* returns one of the multiplier symbols (?,*,+) */
- Plus
- {{ Plus }}
-| Star
- {{ Star }}
-| Qmark
- {{ Qmark }}
-
-
-mixed (lparen_entid) :
- Pcdata
- Ignore*
- material: mixed_alternatives_top()
- {{
- let rest, rparen_entid = material in
- if lparen_entid != rparen_entid then
- raise (Validation_error "Entities not properly nested with parentheses");
- Mixed (MPCDATA :: rest)
- }}
- ? {{ raise(WF_error("Bad content model expression")) }}
-
-
-mixed_alternatives_top():
- entid: Rparen
- {{ [], entid }}
-| entid: RparenStar
- {{ [], entid }}
-| Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar
- {{
- (MChild name :: names), entid
- }}
- ? {{ match !yy_position with
- "name" -> raise(WF_error("Name expected"))
- | "entid" -> raise(WF_error("`)*' expected"))
- | _ -> raise(WF_error("Bad content model expression"))
- }}
-
-
-mixed_alternative() :
- Bar Ignore* name:Name Ignore*
- {{ MChild name }}
- ? {{ match !yy_position with
- "name" -> raise(WF_error("Name expected"))
- | _ -> raise(WF_error("Bad content model expression"))
- }}
-
-
-
-choice_or_seq (lparen_entid):
- /* parses either a regular expression, or a mixed expression. Returns
- * Mixed spec or Regexp spec (content_model_type).
- * Which kind of expression (regexp or mixed) is being read is recognized
- * after the first subexpression has been parsed; the other subexpressions
- * must be of the same kind.
- */
- re: cp()
- Ignore*
- factor: choice_or_seq_factor()
- {{
- let (finalmark,subexpr), rparen_entid = factor in
- if lparen_entid != rparen_entid then
- raise (Validation_error "Entities not properly nested with parentheses");
- (* Check that the other subexpressions are "regexp", too, and
- * merge them with the first.
- *)
- let re' =
- match subexpr with
- Alt [] -> re
- | Alt alt -> Alt (re :: alt)
- | Seq seq -> Seq (re :: seq)
- | _ -> assert false
- in
- (* Interpret the finalmark. *)
- match finalmark with
- Ignore -> re'
- | Plus -> Repeated1 re'
- | Star -> Repeated re'
- | Qmark -> Optional re'
- | _ -> assert false
- }}
- ? {{ raise(WF_error("Bad content model expression")) }}
-
-choice_or_seq_factor():
- /* Parses "|<subexpr>|...)" or ",<subexpr>,...)", both forms optionally
- * followed by ?, *, or +.
- * Returns ((finalmark, expr), rparen_entid), where
- * - finalmark is the character after the right parenthesis or Ignore
- * - expr is either
- * Alt [] meaning that only ")" has been found
- * Alt non_empty_list meaning that the subexpressions are separated by '|'
- * Seq non_empty_list meaning that the subexpressions are separated by ','
- */
- entid:Rparen
- {{ (Ignore, Alt []), entid }}
-| entid:RparenPlus
- {{ (Plus, Alt []), entid }}
-| entid:RparenStar
- {{ (Star, Alt []), entid }}
-| entid:RparenQmark
- {{ (Qmark, Alt []), entid }}
-| Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
- {{
- let (finalmark, subexpr), rparen_entid = factor in
- begin match subexpr with
- Alt [] -> (finalmark, (Alt [re])), rparen_entid
- | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid
- | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
- end
- }}
- ? {{ raise(WF_error("Bad content model expression")) }}
-| Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
- {{
- let (finalmark, subexpr), rparen_entid = factor in
- begin match subexpr with
- Alt [] -> (finalmark, (Seq [re])), rparen_entid
- | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid
- | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
- end
- }}
- ? {{ raise(WF_error("Bad content model expression")) }}
-
-cp():
- /* parse either a name, or a parenthesized subexpression "(...)" */
- name:Name m:multiplier()?
- {{ match m with
- None -> Child name
- | Some Plus -> Repeated1 (Child name)
- | Some Star -> Repeated (Child name)
- | Some Qmark -> Optional (Child name)
- | _ -> assert false
- }}
- ? {{ raise(WF_error("Bad content model expression")) }}
-| entid:Lparen Ignore* m:choice_or_seq(entid)
- {{ m }}
- ? {{ raise(WF_error("Bad content model expression")) }}
-
-
-/********************* ATTRIBUTE LIST DECLARATION ***********************/
-
-attlistdecl():
- /* parses <!ATTLIST ... >. Enters the attribute list in dtd as side-
- * effect.
- */
- decl_attlist_entid: Decl_attlist
- $ {{ let extdecl = context.manager # current_entity_counts_as_external in
- }}
- ws1: Ignore Ignore*
- el_name: Name
- ws: Ignore? Ignore*
- factor: attdef_factor()
- {{
- let at_list, decl_rangle_entid = factor in
-
- if decl_attlist_entid != decl_rangle_entid then
- raise (Validation_error "Entities not properly nested with ATTLIST declaration");
-
- if not ws && at_list <> [] then begin
- match at_list with
- (name,_,_) :: _ ->
- (* This is normally impossible, because the lexer demands
- * some other token between two names.
- *)
- raise(WF_error("Whitespace is missing before `" ^ name ^ "'"));
- | _ -> assert false
- end;
-
- if extend_dtd then begin
- let new_el = new dtd_element dtd el_name in
- (* Note that it is allowed that <!ATTLIST...> precedes the corresponding
- * <!ELEMENT...> declaration. In this case we add the element declaration
- * already to the DTD but leave the content model unspecified.
- *)
- let el =
- try
- dtd # add_element new_el;
- new_el
- with
- Not_found -> (* already added *)
- let old_el = dtd # element el_name in
- if old_el # attribute_names <> [] then
- config.warner # warn ("More than one ATTLIST declaration for element type `" ^
- el_name ^ "'");
- old_el
- in
- List.iter
- (fun (a_name, a_type, a_default) ->
- el # add_attribute a_name a_type a_default extdecl)
- at_list
- end
- }}
- ? {{ match !yy_position with
- "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST"))
- | "el_name" -> raise(WF_error("The name of the element is expected here"))
- | "factor" -> raise(WF_error("Another attribute name or `>' expected"))
- | _ -> raise(WF_error("Bad attribute declaration"))
- }}
-
-
-attdef_factor():
- /* parses a list of triples <name> <type> <default value> and returns the
- * list as (string * att_type * att_default) list.
- */
- attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor()
- {{
- let attdef_rest, decl_rangle_entid = factor in
- if not ws && attdef_rest <> [] then begin
- match attdef_rest with
- (name,_,_) :: _ ->
- raise(WF_error("Missing whitespace before `" ^ name ^ "'"));
- | _ -> assert false
- end;
- (attdef :: attdef_rest), decl_rangle_entid }}
- ? {{ match !yy_position with
- | "factor" -> raise(WF_error("Another attribute name or `>' expected"))
- | _ -> raise(WF_error("Bad attribute declaration"))
- }}
-| entid:Decl_rangle
- {{ [], entid }}
-
-
-attdef():
- /* Parses a single triple */
- name: Name
- ws1: Ignore Ignore*
- tp: atttype()
- ws2: Ignore Ignore*
- default: defaultdecl()
- {{ (name,tp,default) }}
- ? {{ match !yy_position with
- ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
- | "tp" -> raise(WF_error("Type of attribute or `(' expected"))
- | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected"))
- | _ -> raise(WF_error("Bad attribute declaration"))
- }}
-
-atttype():
- /* Parses an attribute type and returns it as att_type. */
- name: Name
- $ {{ let followup =
- if name = "NOTATION" then
- parse_notation
- else
- parse_never
- in
- }}
- nota: [followup]()?
- {{
- match name with
- "CDATA" -> A_cdata
- | "ID" -> A_id
- | "IDREF" -> A_idref
- | "IDREFS" -> A_idrefs
- | "ENTITY" -> A_entity
- | "ENTITIES" -> A_entities
- | "NMTOKEN" -> A_nmtoken
- | "NMTOKENS" -> A_nmtokens
- | "NOTATION" ->
- (match nota with
- None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)"))
- | Some n -> n
- )
- | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected"))
- }}
- ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }}
-
-| Lparen
- Ignore*
- name: name_or_nametoken()
- Ignore*
- names: nmtoken_factor()*
- rp: Rparen
- /* Enumeration */
- {{ A_enum(name :: names) }}
- ? {{ match !yy_position with
- "name" -> raise(WF_error("Name expected"))
- | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
- | "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
- | _ -> raise(WF_error("Bad enumeration type"))
- }}
-
-
-never():
- /* The always failing rule */
- $ {{ raise Not_found; }}
- Doctype /* questionable */
- {{ A_cdata (* Does not matter *)
- }}
-
-
-notation():
- Ignore Ignore*
- lp: Lparen
- Ignore*
- name: Name
- Ignore*
- names: notation_factor()*
- rp: Rparen
- {{ A_notation(name :: names) }}
- ? {{ match !yy_position with
- "lp" -> raise(WF_error("`(' expected"))
- | "name" -> raise(WF_error("Name expected"))
- | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
- | "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
- | _ -> raise(WF_error("Bad NOTATION type"))
- }}
-
-
-notation_factor():
- /* Parse "|<name>" and return the name */
- Bar Ignore* name:Name Ignore*
- {{ name }}
- ? {{ match !yy_position with
- "name" -> raise(WF_error("Name expected"))
- | _ -> raise(WF_error("Bad NOTATION type"))
- }}
-
-nmtoken_factor():
- /* Parse "|<nmtoken>" and return the nmtoken */
- Bar Ignore* n:name_or_nametoken() Ignore*
- {{ n }}
- ? {{ match !yy_position with
- "n" -> raise(WF_error("Nametoken expected"))
- | _ -> raise(WF_error("Bad enumeration type"))
- }}
-
-
-name_or_nametoken():
- n:Name {{ n }}
-| n:Nametoken {{ n }}
-
-
-/* The default values must be expanded and normalized. This has been implemented
- * by the function expand_attvalue.
- */
-
-
-defaultdecl():
- /* Parse the default value for an attribute and return it as att_default */
- Required
- {{ D_required }}
-| Implied
- {{ D_implied }}
-| Fixed ws:Ignore Ignore* str:Unparsed_string
- {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }}
- ? {{ match !yy_position with
- "ws" -> raise(WF_error("Whitespace is missing after #FIXED"))
- | "str" -> raise(WF_error("String literal expected"))
- | _ -> raise(WF_error("Bad #FIXED default value"))
- }}
-| str:Unparsed_string
- {{ D_default (expand_attvalue lexerset dtd str config.warner false) }}
-
-
-/**************************** ENTITY DECLARATION ***********************/
-
-entitydecl(decl_entity_entid):
- /* parses everything _after_ <!ENTITY until the matching >. The parsed
- * entity declaration is entered into the dtd object as side-effect.
- */
- name: Name
- $ {{ let extdecl = context.manager # current_entity_counts_as_external in
- }}
- ws: Ignore Ignore*
- material: entitydef()
- Ignore*
- decl_rangle_entid: Decl_rangle
- /* A general entity */
- {{
- if decl_entity_entid != decl_rangle_entid then
- raise (Validation_error "Entities not properly nested with ENTITY declaration");
- let en =
- (* Distinguish between
- * - internal entities
- * - external entities
- * - NDATA (unparsed) entities
- *)
- match material with
- (Some s, None, None) ->
- new internal_entity dtd name config.warner s p_internal_subset
- config.errors_with_line_numbers false config.encoding
- | (None, Some xid, None) ->
- new external_entity (resolver # clone) dtd name config.warner
- xid false config.errors_with_line_numbers
- config.encoding
-
- | (None, Some xid, Some n) ->
- (new ndata_entity name xid n config.encoding :> entity)
- | _ -> assert false
- in
- dtd # add_gen_entity en extdecl
- }}
- ? {{ match !yy_position with
- "ws" -> raise(WF_error("Whitespace is missing"))
- | "material" -> raise(WF_error("String literal or identifier expected"))
- | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
- | _ -> raise(WF_error("Bad entity declaration"))
- }}
-
-| Percent
- $ {{ let extdecl = context.manager # current_entity_counts_as_external in
- }}
- ws1: Ignore Ignore*
- name: Name
- ws2: Ignore Ignore*
- material: pedef()
- Ignore*
- decl_rangle_entid: Decl_rangle
- /* A parameter entity */
- {{
- if decl_entity_entid != decl_rangle_entid then
- raise (Validation_error "Entities not properly nested with ENTITY declaration");
- let en =
- (* Distinguish between internal and external entities *)
- match material with
- (Some s, None) ->
- new internal_entity dtd name config.warner s p_internal_subset
- config.errors_with_line_numbers true config.encoding
- | (None, Some xid) ->
- new external_entity (resolver # clone) dtd name config.warner
- xid true config.errors_with_line_numbers
- config.encoding
- | _ -> assert false
- in
-
- (* The following two lines force that even internal entities count
- * as external (for the standalone check) if the declaration of
- * the internal entity occurs in an external entity.
- *)
- if extdecl then
- en # set_counts_as_external;
-
- dtd # add_par_entity en;
- }}
- ? {{ match !yy_position with
- ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
- | "material" -> raise(WF_error("String literal or identifier expected"))
- | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
- | _ -> raise(WF_error("Bad entity declaration"))
- }}
-
-
-entitydef():
- /* parses the definition value of a general entity. Returns either:
- * - (Some s, None, None) meaning the definition of an internal entity
- * with (literal) value s has been found
- * - (None, Some x, None) meaning that an external parsed entity with
- * external ID x has been found
- * - (None, Some x, Some n) meaning that an unparsed entity with
- * external ID x and notations n has been found
- */
- str:Unparsed_string
- {{ Some str, None, None }}
-| id:external_id() ws:Ignore? Ignore* decl:ndatadecl()?
- {{ if not ws && decl <> None then
- raise(WF_error("Whitespace missing before `NDATA'"));
- None, Some id, decl
- }}
-
-
-pedef():
- /* parses the definition value of a parameter entity. Returns either:
- * - (Some s, None) meaning that the definition of an internal entity
- * with (literal) value s has been found
- * - (None, Some x) meaning that an external ID x has been found
- */
- str:Unparsed_string
- {{ Some str, None }}
-| id:external_id()
- {{ None, Some id }}
-
-
-ndatadecl():
- /* Parses either NDATA "string" or the empty string; returns Some "string"
- * in the former, None in the latter case.
- */
- ndata:Name ws:Ignore Ignore* name:Name
- {{ if ndata = "NDATA" then
- name
- else
- raise(WF_error("NDATA expected"))
- }}
- ? {{ match !yy_position with
- "ws" -> raise(WF_error("Whitespace is missing after NDATA"))
- | "name" -> raise(WF_error("Name expected"))
- | _ -> raise(WF_error("Bad NDATA declaration"))
- }}
-
-/**************************** NOTATION DECLARATION *******************/
-
-notationdecl():
- /* parses <!NOTATION ... > and enters the notation declaration into the
- * dtd object as side-effect
- */
- decl_notation_entid: Decl_notation
- ws1: Ignore Ignore*
- name: Name
- ws2: Ignore Ignore*
- sys_or_public: Name /* SYSTEM or PUBLIC */
- ws3: Ignore Ignore*
- str1: Unparsed_string
- ws: Ignore? Ignore*
- str2: Unparsed_string?
- Ignore*
- decl_rangle_entid: Decl_rangle
- {{
- if decl_notation_entid != decl_rangle_entid then
- raise (Validation_error "Entities not properly nested with NOTATION declaration");
- let xid =
- (* Note that it is allowed that PUBLIC is only followed by one
- * string literal
- *)
- match sys_or_public with
- "SYSTEM" ->
- if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument"));
- System (recode_utf8 str1)
- | "PUBLIC" ->
- begin match str2 with
- None ->
- check_public_id str1;
- Public(recode_utf8 str1,"")
- | Some p ->
- if not ws then
- raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id"));
- check_public_id str1;
- Public(recode_utf8 str1, recode_utf8 p)
- end
- | _ -> raise(WF_error("PUBLIC or SYSTEM expected"))
- in
- if extend_dtd then begin
- let no = new dtd_notation name xid config.encoding in
- dtd # add_notation no
- end
- }}
- ? {{ match !yy_position with
- ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing"))
- | "name" -> raise(WF_error("Name expected"))
- | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected"))
- | ("str1"|"str2") -> raise(WF_error("String literal expected"))
- | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
- | _ -> raise(WF_error("Bad NOTATION declaration"))
- }}
-
-/****************************** ELEMENTS **************************/
-
-/* In the following rules, the number of error rules is reduced to
- * improve the performance of the parser.
- */
-
-
-contents_start():
- /* parses <element>...</element> misc*, i.e. exactly one element followed
- * optionally by white space or processing instructions.
- * The element is entered into the global variables as follows:
- * - If elstack is non-empty, the parsed element is added as new child to
- * the top element of the stack.
- * - If elstack is empty, the root_examplar object is modified rather than
- * that a new element is created. If additionally the variable root is
- * None, it is assigned Some root_examplar.
- * Note that the modification of the root_exemplar is done by the method
- * internal_init.
- * The reason why the root element is modified rather than newly created
- * is a typing requirement. It must be possible that the class of the root
- * is derived from the original class element_impl, i.e. the user must be
- * able to add additional methods. If we created a new root object, we
- * would have to denote to which class the new object belongs; the root
- * would always be an 'element_impl' object (and not a derived object).
- * If we instead cloned an exemplar object and modified it by the
- * "create" method, the root object would belong to the same class as the
- * exemplar (good), but the type of the parsing function would always
- * state that an 'element_impl' was created (because we can pass the new
- * object only back via a global variable). The only solution is to
- * modify the object that has been passed to the parsing function directly.
- */
- $ {{ dtd <- transform_dtd dtd; }}
- start_tag() content()*
- {{ () }}
-
-
-content():
- /* parses: start tags, end tags, content, or processing
- * instructions. That the tags are properly nested is dynamically checked.
- * As result, recognized elements are added to their parent elements,
- * content is added to the element containing it, and processing instructions
- * are entered into the element embracing them. (All as side-effects.)
- */
- start_tag()
- {{ () }}
-| end_tag()
- {{ () }}
-| char_data()
- {{ () }}
-| cref()
- {{ () }}
-| pi()
- {{ () }}
-| entity_ref()
- {{ () }}
-| comment()
- {{ () }}
-
-
-entity_ref():
- Begin_entity eref_xmldecl_then_rest()
- {{ if n_tags_open = 0 then
- raise(WF_error("Entity reference not allowed here"))
- }}
-
-
-/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
-
-eref_xmldecl_then_rest():
- pl:PI_xml
- $ {{ context.manager # current_entity # process_xmldecl pl;
- }}
- content()* End_entity
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- content() content()* End_entity
- {{ () }}
-
-| $ {{ context.manager # current_entity # process_missing_xmldecl; }}
- End_entity
- {{ () }}
-
-
-start_tag():
- /* parses <element attribute-values> or <element attribute-values/>.
- *
- * EFFECT: If elstack is non-empty, the element is added to the
- * top element of the stack as new child, and the element
- * is pushed on the stack. If elstack is empty, the root_exemplar is
- * modified and gets the parsed name and attribute list. The root_exemplar
- * is pushed on the stack. If additionally the variable root is empty, too,
- * this variable is initialized.
- * If the <element ... /> form has been parsed, no element is pushed
- * on the stack.
- */
- tag: Tag_beg
- $ {{ let position =
- if config.store_element_positions then
- Some(context.manager # position)
- else
- None
- in
- }}
- ws: Ignore? Ignore*
- attlist: attribute()*
- emptiness: start_tag_rangle()
- /* Note: it is guaranteed that there is whitespace between Tag_beg and
- * the name of the first attribute, because there must be some separator.
- * So we need not to check ws!
- */
- {{
- let rec check_attlist al =
- match al with
- (nv1, num1) :: al' ->
- if not num1 && al' <> [] then begin
- match al with
- ((n1,_),_) :: ((n2,_),_) :: _ ->
- raise(WF_error("Whitespace is missing between attributes `" ^
- n1 ^ "' and `" ^ n2 ^ "'"))
- | _ -> assert false
- end;
- check_attlist al'
- | [] -> ()
- in
- check_attlist attlist;
-
- let name, tag_beg_entid = tag in
- let attlist' = List.map (fun (nv,_) -> nv) attlist in
- let d =
- create_element_node ?position:position spec dtd name attlist' in
-
- begin match id_index with
- None -> ()
- | Some idx ->
- (* Put the ID attribute into the index, if present *)
- begin try
- let v = d # id_attribute_value in (* may raise Not_found *)
- idx # add v d (* may raise ID_not_unique *)
- with
- Not_found ->
- (* No ID attribute *)
- ()
- | ID_not_unique ->
- (* There is already an ID with the same value *)
- raise(Validation_error("ID not unique"))
- end
- end;
-
- if n_tags_open = 0 then begin
- if root = None then begin
- (* We have found the begin tag of the root element. *)
- if config.enable_super_root_node then begin
- (* The user wants the super root instead of the real root.
- * The real root element becomes the child of the VR.
- *)
- (* Assertion: self # current is the super root *)
- assert (self # current # node_type = T_super_root);
- root <- Some (self # current);
- self # current # add_node d;
- doc # init_root (self # current);
- end
- else begin
- (* Normal behaviour: The user wants to get the real root. *)
- root <- Some d;
- doc # init_root d;
- end;
- end
- else
- (* We have found a second topmost element. This is illegal. *)
- raise(WF_error("Document must consist of only one toplevel element"))
- end
- else begin
- (* We have found some inner begin tag. *)
- self # save_data; (* Save outstanding data material first *)
- self # current # add_node d
- end;
-
- if emptiness then
- (* An empty tag like <a/>. *)
- d # local_validate ~use_dfa:config.validate_by_dfa ()
- else begin
- (* A non-empty tag. *)
- Stack.push (d, tag_beg_entid) elstack;
- n_tags_open <- n_tags_open + 1;
- end;
- }}
- ? {{ match !yy_position with
- "attlist" -> raise(WF_error("Bad attribute list"))
- | "emptiness" -> raise(WF_error("`>' or `/>' expected"))
- | _ -> raise(WF_error("Bad start tag"))
- }}
-
-
-attribute():
- /* Parses name="value" */
- n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore*
- {{ (n,v), ws }}
-
-
-attval():
- v:Attval
- {{ expand_attvalue lexerset dtd v config.warner true }}
-| v:Attval_nl_normalized
- {{ expand_attvalue lexerset dtd v config.warner false }}
-
-
-start_tag_rangle():
- Rangle {{ false }}
-| Rangle_empty {{ true }}
-
-
-end_tag():
- /* parses </element>.
- * Pops the top element from the elstack and checks if it is the same
- * element.
- */
- tag:Tag_end Ignore* Rangle
- {{ let name, tag_end_entid = tag in
- if n_tags_open = 0 then
- raise(WF_error("End-tag without start-tag"));
-
- self # save_data; (* Save outstanding data material first *)
-
- let x, tag_beg_entid = Stack.pop elstack in
- let x_name =
- match x # node_type with
- | T_element n -> n
- | _ -> assert false
- in
- if name <> x_name then
- raise(WF_error("End-tag does not match start-tag"));
- if tag_beg_entid != tag_end_entid then
- raise(WF_error("End-tag not in the same entity as the start-tag"));
- x # local_validate ~use_dfa:config.validate_by_dfa ();
-
- n_tags_open <- n_tags_open - 1;
-
- assert (n_tags_open >= 0);
-
- }}
-
-char_data():
- /* Parses any literal characters not otherwise matching, and adds the
- * characters to the top element of elstack.
- * If elstack is empty, it is assumed that there is no surrounding
- * element, and any non-white space character is forbidden.
- */
- data:CharData
- {{
- if n_tags_open = 0 then
- (* only white space is allowed *)
- self # only_whitespace data
- else
- self # collect_data data
- (* We collect the chardata material until the next end tag is
- * reached. Then the collected material will concatenated and
- * stored as a single T_data node (see end_tag rule above)
- * using save_data.
- *)
- }}
-| data:Cdata
- {{
- if n_tags_open = 0 then
- raise (WF_error("CDATA section not allowed here"));
- self # collect_data data
- (* Also collect CDATA material *)
- }}
-
-cref():
- /* Parses &#...; and adds the character to the top element of elstack. */
- code:CRef
- {{
- if n_tags_open = 0 then
- (* No surrounding element: character references are not allowed *)
- raise(WF_error("Character reference not allowed here"));
- self # collect_data (character config.encoding config.warner code)
- (* Also collect character references *)
- }}
-
-pi():
- /* Parses <?...?> (but not <?xml white-space ... ?>).
- * If there is a top element in elstack, the processing instruction is added
- * to this element.
- */
- pi: PI
- {{
- let position =
- if config.store_element_positions then
- Some(context.manager # position)
- else
- None
- in
- let target,value = pi in
-
- if n_tags_open = 0 & not config.enable_super_root_node
- then
- doc # add_pinstr (new proc_instruction target value config.encoding)
- else begin
- (* Special case: if processing instructions are processed inline,
- * they are wrapped into T_pinstr nodes.
- *)
- if config.enable_pinstr_nodes then begin
- self # save_data; (* Save outstanding data material first *)
- let pinstr = new proc_instruction target value config.encoding in
- let wrapper = create_pinstr_node
- ?position:position spec dtd pinstr in
- wrapper # local_validate(); (* succeeds always *)
- self # current # add_node wrapper;
- end
- else
- (* Normal behaviour: Add the PI to the parent element. *)
- self # current # add_pinstr
- (new proc_instruction target value config.encoding)
- end
- }}
-
-
-comment():
- /* Parses <!-- ... -->
- */
- Comment_begin
- $ {{
- let position =
- if config.enable_comment_nodes && config.store_element_positions then
- Some(context.manager # position)
- else
- None
- in
- }}
- mat: Comment_material*
- ce: Comment_end
- {{
- if config.enable_comment_nodes then begin
- self # save_data; (* Save outstanding data material first *)
- let comment_text = String.concat "" mat in
- let wrapper = create_comment_node
- ?position:position spec dtd comment_text in
- wrapper # local_validate(); (* succeeds always *)
- self # current # add_node wrapper;
- end
- }}
- ? {{ match !yy_position with
- | "ce" -> raise(WF_error("`-->' expected"))
- | _ -> raise(WF_error("Bad comment"))
- }}
-
-
-%%
- (* The method "parse" continues here... *)
-
- try
- match start_symbol with
- Ext_document ->
- parse_ext_document context.current context.get_next
- | Ext_declarations ->
- parse_ext_declarations context.current context.get_next
- | Ext_element ->
- parse_ext_element context.current context.get_next
- with
- Not_found ->
- raise Parsing.Parse_error
-
- (*********** The method "parse" ends here *************)
-
-
-(**********************************************************************)
-
-(* Here ends the class definition: *)
-end
-;;
-
-(**********************************************************************)
-
-open Pxp_reader;;
-
-
-class default_ext =
- object(self)
- val mutable node = (None : ('a extension node as 'a) option)
- method clone = {< >}
- method node =
- match node with
- None ->
- assert false
- | Some n -> n
- method set_node n =
- node <- Some n
- end
-;;
-
-
-let default_extension = new default_ext;;
-
-let default_spec =
- make_spec_from_mapping
- ~super_root_exemplar: (new element_impl default_extension)
- ~comment_exemplar: (new element_impl default_extension)
- ~default_pinstr_exemplar: (new element_impl default_extension)
- ~data_exemplar: (new data_impl default_extension)
- ~default_element_exemplar: (new element_impl default_extension)
- ~element_mapping: (Hashtbl.create 1)
- ()
-;;
-
-
-let idref_pass id_index root =
- let error t att value =
- let name =
- match t # node_type with
- T_element name -> name
- | _ -> assert false
- in
- let text =
- "Attribute `" ^ att ^ "' of element `" ^ name ^
- "' refers to unknown ID `" ^ value ^ "'" in
- let pos_ent, pos_line, pos_col = t # position in
- if pos_line = 0 then
- raise(Validation_error text)
- else
- raise(At("In entity " ^ pos_ent ^ " at line " ^
- string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^
- ":\n",
- Validation_error text))
- in
-
- let rec check_tree t =
- let idref_atts = t # idref_attribute_names in
- List.iter
- (fun att ->
- match t # attribute att with
- Value s ->
- begin try ignore(id_index # find s) with
- Not_found ->
- error t att s
- end
- | Valuelist l ->
- List.iter
- (fun s ->
- try ignore(id_index # find s) with
- Not_found ->
- error t att s
- )
- l
- | Implied_value -> ()
- )
- idref_atts;
- List.iter check_tree (t # sub_nodes)
- in
- check_tree root
-;;
-
-
-exception Return_DTD of dtd;;
- (* Used by extract_dtd_from_document_entity to jump out of the parser *)
-
-
-let call_parser ~configuration:cfg
- ~source:src
- ~dtd
- ~extensible_dtd
- ~document:doc
- ~specification:spec
- ~process_xmldecl
- ~transform_dtd
- ~(id_index : 'ext #index option)
- ~use_document_entity
- ~entry
- ~init_lexer =
- let e = cfg.errors_with_line_numbers in
- let w = cfg.warner in
- let r, en =
- match src with
- Entity(m,r') -> r', m dtd
- | ExtID(xid,r') -> r',
- if use_document_entity then
- new document_entity
- r' dtd "[toplevel]" w xid e
- cfg.encoding
- else
- new external_entity
- r' dtd "[toplevel]" w xid false e
- cfg.encoding
- in
- r # init_rep_encoding cfg.encoding;
- r # init_warner w;
- en # set_debugging_mode (cfg.debugging_mode);
- let pobj =
- new parser_object
- doc
- dtd
- extensible_dtd
- cfg
- r
- spec
- process_xmldecl
- transform_dtd
- (id_index :> 'ext index option)
- in
- let mgr = new entity_manager en in
- en # open_entity true init_lexer;
- begin try
- let context = make_context mgr in
- pobj # parse context entry;
- ignore(en # close_entity);
- with
- Return_DTD d ->
- ignore(en # close_entity);
- raise(Return_DTD d)
- | error ->
- ignore(en # close_entity);
- r # close_all;
- let pos = mgr # position_string in
- raise (At(pos, error))
- end;
- if cfg.idref_pass then begin
- match id_index with
- None -> ()
- | Some idx ->
- ( match pobj # root with
- None -> ()
- | Some root ->
- idref_pass idx root;
- )
- end;
- pobj
-
-
-let parse_dtd_entity cfg src =
- (* Parse a DTD given as separate entity. *)
- let dtd = new dtd cfg.warner cfg.encoding in
- let doc = new document cfg.warner in
- let pobj =
- call_parser
- ~configuration:cfg
- ~source:src
- ~dtd:dtd
- ~extensible_dtd:true (* Extend the DTD by parsed declarations *)
- ~document:doc
- ~specification:default_spec
- ~process_xmldecl:false (* The XML declaration is ignored
- * (except 'encoding')
- *)
- ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
- ~id_index: None
- ~use_document_entity:false
- ~entry:Ext_declarations (* Entry point of the grammar *)
- ~init_lexer:Declaration (* The initially used lexer *)
- in
- dtd # validate;
- if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
- dtd
-;;
-
-
-let parse_content_entity ?id_index cfg src dtd spec =
- (* Parse an element given as separate entity *)
- dtd # validate; (* ensure that the DTD is valid *)
- if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
- let doc = new document cfg.warner in
- let pobj =
- call_parser
- ~configuration:cfg
- ~source:src
- ~dtd:dtd
- ~extensible_dtd:true (* Extend the DTD by parsed declarations *)
- ~document:doc
- ~specification:spec
- ~process_xmldecl:false (* The XML declaration is ignored
- * (except 'encoding')
- *)
- ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
- ~id_index:(id_index :> 'ext index option)
- ~use_document_entity:false
- ~entry:Ext_element (* Entry point of the grammar *)
- ~init_lexer:Content (* The initially used lexer *)
- in
- match pobj # root with
- Some r -> r
- | None -> raise(WF_error("No root element"))
-;;
-
-
-let parse_wfcontent_entity cfg src spec =
- let dtd = new dtd cfg.warner cfg.encoding in
- dtd # allow_arbitrary;
- let doc = new document cfg.warner in
- let pobj =
- call_parser
- ~configuration:cfg
- ~source:src
- ~dtd:dtd
- ~extensible_dtd:false (* Do not extend the DTD *)
- ~document:doc
- ~specification:spec
- ~process_xmldecl:false (* The XML declaration is ignored
- * (except 'encoding')
- *)
- ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
- ~id_index:None
- ~use_document_entity:false
- ~entry:Ext_element (* Entry point of the grammar *)
- ~init_lexer:Content (* The initially used lexer *)
- in
- match pobj # root with
- Some r -> r
- | None -> raise(WF_error("No root element"))
-;;
-
-
-let iparse_document_entity ?(transform_dtd = (fun x -> x))
- ?id_index
- cfg0 src spec p_wf =
- (* Parse an element given as separate entity *)
- (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *)
- let cfg = { cfg0 with
- recognize_standalone_declaration =
- cfg0.recognize_standalone_declaration && (not p_wf)
- } in
- let dtd = new dtd cfg.warner cfg.encoding in
- if p_wf then
- dtd # allow_arbitrary;
- let doc = new document cfg.warner in
- let pobj =
- call_parser
- ~configuration:cfg
- ~source:src
- ~dtd:dtd
- ~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations
- * only if in validating mode
- *)
- ~document:doc
- ~specification:spec
- ~process_xmldecl:true (* The XML declaration is processed *)
- (* TODO: change to 'not p_wf' ? *)
- ~transform_dtd:(fun dtd ->
- let dtd' = transform_dtd dtd in
- if cfg.accept_only_deterministic_models then
- dtd' # only_deterministic_models;
- dtd')
-
- ~id_index:(id_index :> 'ext index option)
- ~use_document_entity:true
- ~entry:Ext_document (* Entry point of the grammar *)
- ~init_lexer:Document (* The initially used lexer *)
- in
- pobj # doc
-;;
-
-
-let parse_document_entity ?(transform_dtd = (fun x -> x))
- ?id_index
- cfg src spec =
- iparse_document_entity
- ~transform_dtd:transform_dtd
- ?id_index:(id_index : 'ext #index option :> 'ext index option)
- cfg src spec false;;
-
-let parse_wfdocument_entity cfg src spec =
- iparse_document_entity cfg src spec true;;
-
-let extract_dtd_from_document_entity cfg src =
- let transform_dtd dtd = raise (Return_DTD dtd) in
- try
- let doc = parse_document_entity
- ~transform_dtd:transform_dtd
- cfg
- src
- default_spec in
- (* Should not happen: *)
- doc # dtd
- with
- Return_DTD dtd ->
- (* The normal case: *)
- dtd
-;;
-
-
-let default_config =
- let w = new drop_warnings in
- { warner = w;
- errors_with_line_numbers = true;
- enable_pinstr_nodes = false;
- enable_super_root_node = false;
- enable_comment_nodes = false;
- encoding = `Enc_iso88591;
- recognize_standalone_declaration = true;
- store_element_positions = true;
- idref_pass = false;
- validate_by_dfa = true;
- accept_only_deterministic_models = true;
- debugging_mode = false;
- }
-
-
-class [ 'ext ] hash_index =
-object
- constraint 'ext = 'ext node #extension
- val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t)
- method add s n =
- try
- ignore(Hashtbl.find ht s);
- raise ID_not_unique
- with
- Not_found ->
- Hashtbl.add ht s n
-
- method find s = Hashtbl.find ht s
- method index = ht
-end
-
-
-(* ======================================================================
- * History:
- *
- * $Log$
- * Revision 1.1 2000/11/17 09:57:29 lpadovan
- * Initial revision
- *
- * Revision 1.14 2000/08/26 23:23:14 gerd
- * Bug: from_file must not interpret the file name as URL path.
- * Bug: When PI and comment nodes are generated, the collected data
- * material must be saved first.
- *
- * Revision 1.13 2000/08/19 21:30:03 gerd
- * Improved the error messages of the parser
- *
- * Revision 1.12 2000/08/18 20:16:25 gerd
- * Implemented that Super root nodes, pinstr nodes and comment
- * nodes are included into the document tree.
- *
- * Revision 1.11 2000/08/14 22:24:55 gerd
- * Moved the module Pxp_encoding to the netstring package under
- * the new name Netconversion.
- *
- * Revision 1.10 2000/07/23 02:16:33 gerd
- * Support for DFAs.
- *
- * Revision 1.9 2000/07/14 13:57:29 gerd
- * Added the id_index feature.
- *
- * Revision 1.8 2000/07/09 17:52:45 gerd
- * New implementation for current_data.
- * The position of elements is stored on demand.
- *
- * Revision 1.7 2000/07/09 01:00:35 gerd
- * Improvement: It is now guaranteed that only one data node
- * is added for consecutive character material.
- *
- * Revision 1.6 2000/07/08 16:27:29 gerd
- * Cleaned up the functions calling the parser.
- * New parser argument: transform_dtd.
- * Implementations for 'extract_dtd_from_document_entity' and
- * 'parse_wfcontent_entity'.
- *
- * Revision 1.5 2000/07/06 23:05:18 gerd
- * Initializations of resolvers were missing.
- *
- * Revision 1.4 2000/07/06 22:11:01 gerd
- * Fix: The creation of the non-virtual root element is protected
- * in the same way as the virtual root element.
- *
- * Revision 1.3 2000/07/04 22:15:18 gerd
- * Change: Using the new resolver capabilities.
- * Still incomplete: the new extraction and parsing functions.
- *
- * Revision 1.2 2000/06/14 22:19:06 gerd
- * Added checks such that it is impossible to mix encodings.
- *
- * Revision 1.1 2000/05/29 23:48:38 gerd
- * Changed module names:
- * Markup_aux into Pxp_aux
- * Markup_codewriter into Pxp_codewriter
- * Markup_document into Pxp_document
- * Markup_dtd into Pxp_dtd
- * Markup_entity into Pxp_entity
- * Markup_lexer_types into Pxp_lexer_types
- * Markup_reader into Pxp_reader
- * Markup_types into Pxp_types
- * Markup_yacc into Pxp_yacc
- * See directory "compatibility" for (almost) compatible wrappers emulating
- * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
- *
- * ======================================================================
- * Old logs from markup_yacc.m2y:
- *
- * Revision 1.9 2000/05/29 21:14:57 gerd
- * Changed the type 'encoding' into a polymorphic variant.
- *
- * Revision 1.8 2000/05/27 19:26:19 gerd
- * Change: The XML declaration is interpreted right after
- * it has been parsed (no longer after the document): new function
- * check_and_parse_xmldecl.
- * When elements, attributes, and entities are declared
- * it is stored whether the declaration happens in an external
- * entity (for the standalone check).
- * The option recognize_standalone_declaration is interpreted.
- *
- * Revision 1.7 2000/05/20 20:31:40 gerd
- * Big change: Added support for various encodings of the
- * internal representation.
- *
- * Revision 1.6 2000/05/14 21:51:24 gerd
- * Change: Whitespace is handled by the grammar, and no longer
- * by the entity.
- *
- * Revision 1.5 2000/05/14 17:50:54 gerd
- * Updates because of changes in the token type.
- *
- * Revision 1.4 2000/05/11 22:09:17 gerd
- * Fixed the remaining problems with conditional sections.
- * This seems to be also a weakness of the XML spec!
- *
- * Revision 1.3 2000/05/09 00:02:44 gerd
- * Conditional sections are now recognized by the parser.
- * There seem some open questions; see the TODO comments!
- *
- * Revision 1.2 2000/05/08 22:01:44 gerd
- * Introduced entity managers (see markup_entity.ml).
- * The XML declaration is now recognized by the parser. If such
- * a declaration is found, the method process_xmldecl of the currently
- * active entity is called. If the first token is not an XML declaration,
- * the method process_missing_xmldecl is called instead.
- * Some minor changes.
- *
- * Revision 1.1 2000/05/06 23:21:49 gerd
- * Initial revision.
- *
- *
- * ======================================================================
- *
- * COPIED FROM REVISION 1.19 OF markup_yacc.mly
- *
- * Revision 1.19 2000/05/01 15:20:08 gerd
- * "End tag matches start tag" is checked before "End tag in the
- * same entity as start tag".
- *
- * Revision 1.18 2000/04/30 18:23:08 gerd
- * Bigger change: Introduced the concept of virtual roots. First,
- * this reduces the number of checks. Second, it makes it possible to
- * return the virtual root to the caller instead of the real root (new
- * config options 'virtual_root' and 'processing_instructions_inline').
- * Minor changes because of better CR/CRLF handling.
- *
- * Revision 1.17 2000/03/13 23:47:46 gerd
- * Updated because of interface changes. (See markup_yacc_shadow.mli
- * rev. 1.8)
- *
- * Revision 1.16 2000/01/20 20:54:43 gerd
- * New config.errors_with_line_numbers.
- *
- * Revision 1.15 1999/12/17 22:27:58 gerd
- * Bugfix: The value of 'p_internal_subset' (an instance
- * variable of the parser object) is to true when the internal subset
- * begins, and is set to false when this subset ends. The error was
- * that references to external entities within this subset did not
- * set 'p_internal_subset' to false; this is now corrected by introducing
- * the 'p_internal_subset_stack'.
- * This is a typical example of how the code gets more and
- * more complicated and that it is very difficult to really understand
- * what is going on.
- *
- * Revision 1.14 1999/11/09 22:23:37 gerd
- * Removed the invocation of "init_dtd" of the root document.
- * This method is no longer available. The DTD is also passed to the
- * document object by the root element, so nothing essential changes.
- *
- * Revision 1.13 1999/10/25 23:37:09 gerd
- * Bugfix: The warning "More than one ATTLIST declaration for element
- * type ..." is only generated if an ATTLIST is found while there are already
- * attributes for the element.
- *
- * Revision 1.12 1999/09/01 23:08:38 gerd
- * New frontend function: parse_wf_document. This simply uses
- * a DTD that allows anything, and by the new parameter "extend_dtd" it is
- * avoided that element, attlist, and notation declarations are added to this
- * DTD. The idea is that this function simulates a well-formedness parser.
- * Tag_beg, Tag_end carry the entity_id. The "elstack" stores the
- * entity_id of the stacked tag. This was necessary because otherwise there
- * are some examples to produces incorrectly nested elements.
- * p_internal_subset is a variable that stores whether the internal
- * subset is being parsed. This is important beacause entity declarations in
- * internal subsets are not allowed to contain parameter references.
- * It is checked if the "elstack" is empty after all has been parsed.
- * Processing instructions outside DTDs and outside elements are now
- * added to the document.
- * The rules of mixed and regexp style content models have been
- * separated. The code is now much simpler.
- * Entity references outside elements are detected and rejected.
- *
- * Revision 1.11 1999/09/01 16:26:08 gerd
- * Improved the quality of error messages.
- *
- * Revision 1.10 1999/08/31 19:13:31 gerd
- * Added checks on proper PE nesting. The idea is that tokens such
- * as Decl_element and Decl_rangle carry an entity ID with them. This ID
- * is simply an object of type < >, i.e. you can only test on identity.
- * The lexer always produces tokens with a dummy ID because it does not
- * know which entity is the current one. The entity layer replaces the dummy
- * ID with the actual ID. The parser checks that the IDs of pairs such as
- * Decl_element and Decl_rangle are the same; otherwise a Validation_error
- * is produced.
- *
- * Revision 1.9 1999/08/15 20:42:01 gerd
- * Corrected a misleading message.
- *
- * Revision 1.8 1999/08/15 20:37:34 gerd
- * Improved error messages.
- * Bugfix: While parsing document entities, the subclass document_entity is
- * now used instead of external_entity. The rules in document entities are a bit
- * stronger.
- *
- * Revision 1.7 1999/08/15 14:03:59 gerd
- * Empty documents are not allowed.
- * "CDATA section not allowed here" is a WF_error, not a Validation_
- * error.
- *
- * Revision 1.6 1999/08/15 02:24:19 gerd
- * Removed some grammar rules that were used for testing.
- * Documents without DTD can now have arbitrary elements (formerly
- * they were not allowed to have any element).
- *
- * Revision 1.5 1999/08/14 22:57:20 gerd
- * It is allowed that external entities are empty because the
- * empty string is well-parsed for both declarations and contents. Empty
- * entities can be referenced anywhere because the references are replaced
- * by nothing. Because of this, the Begin_entity...End_entity brace is only
- * inserted if the entity is non-empty. (Otherwise references to empty
- * entities would not be allowed anywhere.)
- * As a consequence, the grammar has been changed such that a
- * single Eof is equivalent to Begin_entity,End_entity without content.
- *
- * Revision 1.4 1999/08/14 22:20:01 gerd
- * The "config" slot has now a component "warner" which is
- * an object with a "warn" method. This is used to warn about characters
- * that cannot be represented in the Latin 1 alphabet.
- * Furthermore, there is a new component "debugging_mode".
- * Some Parse_error exceptions have been changed into Validation_error.
- * The interfaces of functions/classes imported from other modules
- * have changed; the invocations have been adapted.
- * Contents may contain CDATA sections that have been forgotten.
- *
- * Revision 1.3 1999/08/11 15:00:41 gerd
- * The Begin_entity ... End_entity brace is also possible in
- * 'contents'.
- * The configuration passed to the parsing object contains always
- * the resolver that is actually used.
- *
- * Revision 1.2 1999/08/10 21:35:12 gerd
- * The XML/encoding declaration at the beginning of entities is
- * evaluated. In particular, entities have now a method "xml_declaration"
- * which returns the name/value pairs of such a declaration. The "encoding"
- * setting is interpreted by the entity itself; "version", and "standalone"
- * are interpreted by Markup_yacc.parse_document_entity. Other settings
- * are ignored (this does not conform to the standard; the standard prescribes
- * that "version" MUST be given in the declaration of document; "standalone"
- * and "encoding" CAN be declared; no other settings are allowed).
- * TODO: The user should be warned if the standard is not exactly
- * fulfilled. -- The "standalone" property is not checked yet.
- *
- * Revision 1.1 1999/08/10 00:35:52 gerd
- * Initial revision.
- *
- *
- *)