2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
15 (* Some types from the interface definition: *)
17 exception ID_not_unique
19 class type [ 'ext ] index =
21 constraint 'ext = 'ext node #extension
22 method add : string -> 'ext node -> unit
23 method find : string -> 'ext node
28 { warner : collect_warnings;
29 errors_with_line_numbers : bool;
30 enable_pinstr_nodes : bool;
31 enable_super_root_node : bool;
32 enable_comment_nodes : bool;
33 encoding : rep_encoding;
34 recognize_standalone_declaration : bool;
35 store_element_positions : bool;
37 validate_by_dfa : bool;
38 accept_only_deterministic_models : bool;
39 debugging_mode : bool;
43 Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
44 | ExtID of (ext_id * Pxp_reader.resolver)
54 { mutable current : unit -> token; (* get the current token *)
55 mutable get_next : unit -> token; (* go on to the next token; return it *)
56 mutable current_token : token; (* This is the current token *)
57 mutable manager : entity_manager; (* The entity manager *)
61 let make_context entity_manager =
63 { current = (fun _ -> assert false);
64 get_next = (fun _ -> assert false);
66 manager = entity_manager;
69 (* Note that the function which is stored in get_next_ref can be changed
70 * as a side-effect when an entity is opened or closed. The function in
71 * c.get_next must be programmed such that always the current "get_next"
72 * function is executed.
74 let get_next_ref = entity_manager # yy_get_next_ref in
75 c.current <- (fun () -> c.current_token);
76 c.get_next <- (fun () -> let tok = !get_next_ref() in
77 c.current_token <- tok;
84 let from_channel ?system_encoding ?id:init_id ?fixenc ch =
86 (* Reading from a channel works by modifying the algorithm of
90 let url_syntax = (* A syntax suitable for "file" URLs *)
91 { Neturl.null_url_syntax with
92 Neturl.url_enable_scheme = Neturl.Url_part_allowed;
93 Neturl.url_enable_host = Neturl.Url_part_allowed;
94 Neturl.url_enable_path = Neturl.Url_part_required;
95 Neturl.url_accepts_8bits = true;
107 let init_channel_done = ref false in
108 (* Whether the first access to this source has already happened. *)
110 (* The task of url_of_id is:
111 * - When it is called the first time, and no init_id is present,
112 * the URL file:/// is passed back (an_url). This forces that
113 * absolute path names /path/dir/... will be interpreted as
114 * file path names. (But relative path names will not work.)
115 * - If an init_id has been passed, we can assume that the opened URL
116 * is exactly this init_id. By raising Not_competent it is indicated
117 * that the standard method is to be used for the interpretation of
119 * - Otherwise, the channel is already being read, and thus cannot again
120 * opened. (This case is handled in channel_of_url.)
124 if !init_channel_done then begin
125 (* Use the normal way of determining the URL of the ID: *)
126 raise Pxp_reader.Not_competent
132 (* If the channel is not associated with any URL: Simply pass
133 * the URL file:/// back.
135 | Some the_init_id ->
136 assert (the_init_id = xid);
137 raise Pxp_reader.Not_competent
138 (* If the channel is associated with a URL, the corresponding
139 * ID must be passed when the first invocation happens.
144 (* The task of channel_of_url:
145 * - If it is called the first time ("else"), the channel is returned
146 * - Otherwise, the channel is already being read, and thus cannot again
147 * opened. By raising Not_competent it is signaled that the
148 * resolve_as_file object must not continue to open the URL.
151 let channel_of_url url =
152 if !init_channel_done then
153 raise Pxp_reader.Not_competent
155 init_channel_done := true;
161 new Pxp_reader.resolve_as_file
162 ?system_encoding:system_encoding
164 ~channel_of_url:channel_of_url
172 (* Note: 'id' may be illegal (malformed); in this case, the first
173 * invocation of url_of_id will raise Not_competent, and the 'open_in'
183 let from_file ?system_encoding utf8_filename =
186 new Pxp_reader.resolve_as_file
187 ?system_encoding:system_encoding
191 let utf8_abs_filename =
192 if utf8_filename <> "" && utf8_filename.[0] = '/' then
195 Sys.getcwd() ^ "/" ^ utf8_filename
198 let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in
199 let url = Neturl.make_url
202 ~path:(Neturl.split_path utf8_abs_filename)
206 let xid = System (Neturl.string_of_url url) in
213 let from_string ?fixenc s =
215 new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in
220 (**********************************************************************)
222 class ['ext] parser_object
223 init_doc init_dtd init_extend_dtd init_config init_resolver init_spec
224 init_process_xmldecl transform_dtd id_index
228 (* Note that the 'ext parameter has been the motivation to make the
232 val mutable dtd = init_dtd
233 (* The DTD being parsed; or the DTD currently assumed *)
235 val extend_dtd = init_extend_dtd
236 (* Whether the DTD should be extended by ELEMENT, ATTLIST, and
237 * NOTATION declarations or not. (True for validating mode,
238 * false for well-formedness mode.)
241 val transform_dtd = transform_dtd
242 (* A function transforming the DTD *)
244 val id_index = (id_index : 'ext index option)
245 (* The ID index or None *)
247 val process_xmldecl = init_process_xmldecl
248 (* Whether the XML declaration is parsed and the found XML version
249 * and standalone declaration are passed to 'doc'.
252 val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding)
255 (* The current document *)
257 method doc = (doc : 'ext document)
259 val resolver = init_resolver
260 (* The resolver for external IDs *)
262 val config = init_config
263 (* The current configuration *)
265 val elstack = (Stack.create() : ('ext node * entity_id) Stack.t)
266 (* The element stack containing all open elements, i.e. elements that
267 * have begun by a start tag but that have not been finished (end tag).
268 * If the parser sees a start tag, it creates the element and pushes it
269 * on top of this stack. If the parser recognizes an end tag, it pulls
270 * one element from the stack and checks if it has the same name as
271 * given with the end tag.
273 * At initialization time, a special element is pushed on the stack,
274 * the so-called super root. It is always the bottommost
275 * element of the stack, and serves as a guard.
276 * [See "initializer" below.]
280 (* Get the top element of the element stack *)
282 fst(Stack.top elstack)
284 Stack.Empty -> assert false
285 (* Not possible, because the super root is always the element
286 * at the bottom of the stack.
289 val mutable n_tags_open = 0
290 (* Number of begin tags that have been parsed and whose corresponding
291 * end tags have not yet been parsed
294 val mutable p_internal_subset = false
295 (* true while parsing the internal subset - there are some additional
296 * constraints for internal subsets, and because of this it must
297 * be known whether the current declaration is contained in the
298 * internal or external subset of the DTD.
301 val mutable root = None
302 (* Contains the root element (topmost element) while it is being parsed
303 * and after it has been parsed.
304 * This variable is None before the root element is seen.
310 (* A hashtable that contains exemplar objects for the various element
311 * types. If an element is parsed, the exemplar is looked up and
312 * "cloned" (by the "create" method)
315 val mutable current_data = []
316 (* Collects character data. *)
318 method collect_data s =
319 (* Collects the character material 's' *)
320 current_data <- s :: current_data
323 (* Puts the material collected in 'current_data' into a new
324 * node, and appends this node as new sub node to 'current'
326 match current_data with
331 self # current # add_node (create_data_node spec dtd str);
334 let count = List.fold_left
335 (fun acc s -> acc + String.length s)
338 let str = String.create count in
339 let pos = ref count in
342 let l = String.length s in
354 self # current # add_node (create_data_node spec dtd str);
358 method only_whitespace data =
359 (* Checks that the string "data" contains only whitespace. On failure,
360 * Validation_error is raised.
362 let lexbuf = Lexing.from_string data in
363 let t1 = lexerset.scan_name_string lexbuf in
365 raise(WF_error("Data not allowed here"));
366 let t2 = lexerset.scan_name_string lexbuf in
368 raise(WF_error("Data not allowed here"));
373 if config.encoding <> dtd # encoding then
374 failwith("Encoding mismatch");
376 (* --- Initialize 'elstack': Push the super-root on the stack. *)
378 if config.enable_super_root_node then
379 create_super_root_node spec dtd
381 (* because spec may not contain an exemplar for the super root: *)
382 create_no_node spec dtd
384 (* Move the super root or the emulation to the stack: *)
385 Stack.push (super_root, (self :> entity_id)) elstack;
389 (********* Here the method "parse" begins. The grammar below is
390 * transformed to a local function of this method
393 method parse context start_symbol =
395 let parse_ignored_section yy_current yy_get_next =
396 (* A special parser which should be used after <![IGNORE[.
397 * It parses until the corresponding ]]> is found.
400 while yy_current() = Ignore do
401 ignore(yy_get_next());
404 ( match yy_current() with
405 Conditional_body _ -> ()
406 | _ -> raise Parsing.Parse_error;
409 let en = context.manager # current_entity in
412 let igntok = en # next_ignored_token in
413 (* next_ignored_token: uses a special lexer that only
414 * recognizes Conditional_begin and Conditional_end;
415 * other character combinations are ignored.
417 (* NOTE: next_ignored_token works much like yy_get_next,
418 * but it does not set the current token!
421 Conditional_begin _ ->
423 | Conditional_end _ ->
425 (* Because the loop may be exited now: *)
426 context.current_token <- igntok;
427 | (End_entity | Eof) ->
428 raise Parsing.Parse_error
436 let check_and_parse_xmldecl xmldecl =
437 if process_xmldecl then begin
438 let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in
440 doc # init_xml_version v;
445 | _ -> raise (WF_error("Illegal 'standalone' declaration"))
447 if config.recognize_standalone_declaration then
448 dtd # set_standalone_declaration v
453 (* Recode 's' to UTF-8 *)
454 if config.encoding = `Enc_utf8 then
455 s (* No recoding necessary *)
457 Netconversion.recode_string
458 ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s
464 /* The following grammar looks similar to ocamlyacc grammars, but
465 * ocamlyacc is actually not used to transform the grammar into a parser.
466 * Instead, the parser generator m2parsergen is applied.
468 * The format of the grammar is different (see m2parsergen/README),
469 * but I hope that you can understand most features immediately.
471 * The type of the parser is different: m2parsergen creates a top-down
472 * parser while ocamlyacc generates a LALR-1 parser.
474 * The way the generated code is called is different: ocamlyacc produces
475 * lots of top-level definitions whereas m2parsergen generates only
476 * a local let-in-phrase. This is explained in the already mentioned
480 /* See Pxp_types.ml for comments to the various tokens */
490 %token <> Conditional_begin
491 %token <> Conditional_body
492 %token <> Conditional_end
505 %token <> Comment_material
507 %token <> Doctype_rangle
510 %token <> Decl_element
511 %token <> Decl_attlist
512 %token <> Decl_entity
513 %token <> Decl_notation
514 %token <> Decl_rangle
519 %token <> RparenQmark
535 %token <> Attval_nl_normalized
536 %token <> Unparsed_string
540 * "ext_document": parses a complete XML document (i.e. containing a
541 * <!DOCTYPE..> and an element)
542 * "ext_declarations": parses an "external DTD subset", i.e. a sequence
544 * "ext_element": parses a single element (no <!DOCTYPE...> allowed);
545 * the element needs not to be the root element of the
548 * The functions corresponding to these symbols return always () because
549 * they only have side-effects.
552 /* SOME GENERAL COMMENTS:
554 * The parser does not get its tokens from the lexers directly. Instead of
555 * this, there is an entity object between the parser and the lexers. This
556 * object already handles:
558 * - References to general and parameter entities. The token stream is
559 * modified such that tokens automatically come from the referenced entities.
560 * External parameter entities and all general entities are embraced by
561 * the two special tokens Begin_entity and End_entity. The parser must
562 * check that these braces are correctly nested.
570 doc_xmldecl_then_misc_then_prolog_then_rest() End_entity
572 if n_tags_open <> 0 then
573 raise(WF_error("Missing end tag"))
577 /* In the following rule, we must find out whether there is an XML declaration
578 * or not, and directly after that either "process_xmldecl" or
579 * "process_missing_xmldecl" of the current entity must be called.
580 * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out
581 * in the "$" clause immediately following the first token.
583 * TODO: This is not enough. The first token may be a tag, and the tag
584 * may already contain non-ASCII characters. (But in this case, the resolvers
585 * assume UTF8, and they are right...)
588 doc_xmldecl_then_misc_then_prolog_then_rest():
590 $ {{ context.manager # current_entity # process_xmldecl pl;
591 check_and_parse_xmldecl pl;
593 misc()* doc_prolog_then_rest()
596 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
597 misc() misc()* doc_prolog_then_rest()
600 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
601 doctypedecl() misc()* contents_start()
604 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
609 doc_prolog_then_rest():
610 doctypedecl() misc()* contents_start()
617 Begin_entity el_xmldecl_then_misc_then_rest() End_entity
619 if n_tags_open <> 0 then
620 raise(WF_error("Missing end tag"))
624 /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
626 el_xmldecl_then_misc_then_rest():
628 $ {{ context.manager # current_entity # process_xmldecl pl; }}
629 misc()* contents_start()
632 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
633 misc() misc()* contents_start()
636 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
642 /* Parses a sequence of declarations given by an entity. As side-effect,
643 * the parsed declarations are put into the dtd object.
645 Begin_entity decl_xmldecl_then_rest()
651 decl_xmldecl_then_rest():
652 /* Note: This rule is also called from declaration()! */
654 $ {{ context.manager # current_entity # process_xmldecl pl;
656 declaration()* End_entity
659 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
660 declaration() declaration()* End_entity
663 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
672 /* In this context, the lexers sometimes do not recognize white space;
673 * instead CharData tokens containing white space are delivered.
675 {{ self # only_whitespace data }}
682 /********************* DOCUMENT TYPE DECLARATION *************************/
685 /* parses from <!DOCTYPE to >. As side-effect, first the declarations of
686 * the internal DTD (if any) are put into !!on_dtd, then the declarations
687 * of the external DTD (if any) are put into this DTD object.
689 doctype_entid: Doctype
691 doctypedecl_material (doctype_entid)
693 ? {{ match !yy_position with
694 "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'"))
695 | _ -> raise(WF_error("Bad DOCTYPE declaration"))
700 * ws: Ignore? Ignore*
701 * is meant seriously. The effect is that ws becomes a boolean variable
702 * which is true if there is an Ignore token and false otherwise.
703 * This construct is faster than just
705 * in which case ws becomes an integer variable containing the number of
706 * Ignore tokens. Counting the number of tokens is slower than only checking
709 * We need the information whether there is an Ignore token (representing
710 * white space), because white space is only obligatory if also an identifier
711 * for the external subset is parsed; this conditional syntax constraint is
712 * simply programmed in the body of the grammar rule.
715 doctypedecl_material(doctype_entid):
718 external_subset: external_id()?
720 internal_subset: internal_dtd()?
722 doctype_rangle_entid: Doctype_rangle
724 if doctype_entid != doctype_rangle_entid then
725 raise (Validation_error("Entities not properly nested with DOCTYPE declaration"));
726 dtd # set_root root_name;
727 begin match external_subset, internal_subset with
728 None, None -> () (* no DTD means no ID *)
729 | None, Some _ -> dtd # set_id Internal
730 | Some id, None -> dtd # set_id (External id)
731 | Some id, Some _ -> dtd # set_id (Derived id)
733 (* Get now the external doctype declaration. Note that the internal
734 * subset has precedence and must be read first.
736 begin match external_subset with
740 raise(WF_error("Whitespace is missing after `DOCTYPE " ^
742 let r' = resolver # clone in
745 (new document config.warner)
755 let en = new external_entity r' dtd "[dtd]"
756 config.warner id false config.errors_with_line_numbers
759 en # set_debugging_mode (config.debugging_mode);
760 let mgr = new entity_manager en in
761 en # open_entity true Declaration;
763 let context = make_context mgr in
764 pobj # parse context Ext_declarations;
765 ignore(en # close_entity);
768 ignore(en # close_entity);
770 let pos = mgr # position_string in
771 raise (At(pos, error))
776 match !yy_position with
777 "doctype_rangle_entid" -> raise(WF_error("`>' expected"))
778 | _ -> raise(WF_error("Bad DOCTYPE declaration"))
781 /* Note that there are no keywords for SYSTEM or PUBLIC, as these would
782 * be difficult to recognize in the lexical contexts. Because of this,
783 * SYSTEM/PUBLIC is parsed as name, and the rule for everything after
784 * SYSTEM/PUBLIC is computed dynamically.
792 "SYSTEM" -> parse_system_id
793 (* Apply the rule system_id (below) to parse the
796 | "PUBLIC" -> parse_public_id
797 (* Apply the rule public_id (below) to parse the
800 | _ -> raise(WF_error("SYSTEM or PUBLIC expected"))
806 ? {{ match !yy_position with
807 "ws" -> raise(WF_error("Whitespace is missing after " ^ tok))
808 | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier"))
814 {{ System (recode_utf8 str) }}
818 str1: Unparsed_string
820 str2: Unparsed_string
821 {{ check_public_id str1;
822 Public(recode_utf8 str1, recode_utf8 str2)
824 ? {{ match !yy_position with
825 "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier"))
826 | _ -> raise(WF_error("Bad PUBLIC identifier"))
830 /* The internal subset: "[" declaration* "]". While parsing the declarations
831 * the object variable p_internal_subset must be true; however, if there
832 * are entity references, this variable must be reset to false during
833 * the entity. (See the rule for "declaration" below.)
837 dtd_begin_entid: internal_dtd_begin()
839 dtd_end_entid: internal_dtd_end()
841 if dtd_begin_entid != dtd_end_entid then
842 raise(Validation_error("Entities not properly nested with internal DTD subset"))
844 ? {{ match !yy_position with
845 "dtd_end_entid" -> raise(WF_error("`]' expected"))
846 | _ -> raise(WF_error("Bad internal DTD subset"))
850 internal_dtd_begin():
852 {{ assert (not p_internal_subset);
853 p_internal_subset <- true }}
858 {{ assert p_internal_subset;
859 p_internal_subset <- false }}
863 /* Parses a single declaration (or processing instruction). As side-effect
864 * the parsed declaration is stored into the dtd object.
870 | entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid)
872 ? {{ match !yy_position with
873 "ws" -> raise(WF_error("Whitespace is missing after ENTITY"))
874 | "e" -> raise(WF_error("Name or `%' expected"))
875 | _ -> raise(WF_error("Bad entity declaration"))
880 {{ let target, value = pi in
881 let pi = new proc_instruction target value config.encoding in
886 | Comment_begin Comment_material* ce:Comment_end
888 ? {{ match !yy_position with
889 "ce" -> raise(WF_error("`-->' expected"))
890 | _ -> raise(WF_error("Bad comment"))
893 $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity'
894 * rule is parsed. This allows unrestricted usage of parameter entities
895 * within declarations of internal entities.
897 let old_p_internal_subset = p_internal_subset in
898 p_internal_subset <- false;
900 decl_xmldecl_then_rest()
901 {{ (* Restore the old value of 'p_internal_subset'. *)
902 p_internal_subset <- old_p_internal_subset;
905 | begin_entid:Conditional_begin
906 $ {{ (* Check whether conditional sections are allowed at this position. *)
907 if p_internal_subset then
908 raise(WF_error("Restriction of the internal subset: Conditional sections not allowed"));
911 cond:conditional_section() end_entid:Conditional_end
912 {{ (* Check whether Conditional_begin and Conditional_end are in the same
913 * entity. (This restriction is explained in the file SPECS.)
915 if begin_entid != end_entid then
916 raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)"));
918 ? {{ match !yy_position with
919 "end_entid" -> raise(WF_error("`>]>' expected"))
920 | "cond" -> raise(WF_error("INCLUDE or IGNORE expected"))
921 | _ -> raise(WF_error("Bad conditional section"))
924 /* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the
925 * right parsing rule is dynamic.
926 * Note that parse_ignored_section is not defined by a grammar rule but
927 * by a conventional let-binding above.
930 conditional_section():
931 include_or_ignore:Name
932 $ {{ let parsing_function =
933 match include_or_ignore with
934 "INCLUDE" -> parse_included_section
935 (* invoke rule "included_section" below *)
936 | "IGNORE" -> parse_ignored_section
937 (* invoke function "parse_ignored_section" *)
938 | _ -> raise(WF_error("INCLUDE or IGNORE expected"))
941 [ parsing_function ] ()
943 ? {{ raise(WF_error("Bad conditional section")) }}
946 Conditional_body declaration()*
948 | Ignore Ignore* Conditional_body declaration()*
952 /*************************** ELEMENT DECLARATIONS ********************/
955 /* parses <!ELEMENT ... >. Puts the parsed element type as side-effect into
958 decl_element_entid: Decl_element
959 $ {{ let extdecl = context.manager # current_entity_counts_as_external in
964 content_model: contentspec()
966 decl_rangle_entid: Decl_rangle
968 if decl_element_entid != decl_rangle_entid then
969 raise (Validation_error "Entities not properly nested with ELEMENT declaration");
970 if extend_dtd then begin
971 let el = new dtd_element dtd name in
972 (* It is allowed that an <!ATTLIST...> precedes the corresponding
973 * <!ELEMENT...>. Because of this it is possible that there is already
974 * an element called 'name' in the DTD, and we only must set the content
975 * model of this element.
978 dtd # add_element el;
979 el # set_cm_and_extdecl content_model extdecl;
981 Not_found -> (* means: there is already an element 'name' *)
982 let el' = dtd # element name in
983 el' # set_cm_and_extdecl content_model extdecl;
984 (* raises Validation_error if el' already has a content model *)
987 ? {{ match !yy_position with
988 ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
989 | "name" -> raise(WF_error("The name of the element is expected here"))
990 | "content_model" -> raise(WF_error("Content model expression expected"))
991 | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
992 | _ -> raise(WF_error("Bad element type declaration"))
996 /* parses a content model and returns it (type content_model_type) */
997 name: Name /* EMPTY or ANY */
1001 | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected"))
1003 | entid:Lparen Ignore* term:mixed_or_regexp(entid)
1005 ? {{ raise(WF_error("Bad content model expression")) }}
1008 /* Many of the following rules have an lparen_entid argument. This is the
1009 * internal ID of the entity containing the corresponding left parenthesis;
1010 * by comparing it with the ID of the entity of the right parenthesis the
1011 * contraint is implemented that both parentheses must be in the same entity.
1014 mixed_or_regexp(lparen_entid):
1015 re: choice_or_seq(lparen_entid)
1017 | m: mixed(lparen_entid)
1022 /* returns one of the multiplier symbols (?,*,+) */
1031 mixed (lparen_entid) :
1034 material: mixed_alternatives_top()
1036 let rest, rparen_entid = material in
1037 if lparen_entid != rparen_entid then
1038 raise (Validation_error "Entities not properly nested with parentheses");
1039 Mixed (MPCDATA :: rest)
1041 ? {{ raise(WF_error("Bad content model expression")) }}
1044 mixed_alternatives_top():
1049 | Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar
1051 (MChild name :: names), entid
1053 ? {{ match !yy_position with
1054 "name" -> raise(WF_error("Name expected"))
1055 | "entid" -> raise(WF_error("`)*' expected"))
1056 | _ -> raise(WF_error("Bad content model expression"))
1060 mixed_alternative() :
1061 Bar Ignore* name:Name Ignore*
1063 ? {{ match !yy_position with
1064 "name" -> raise(WF_error("Name expected"))
1065 | _ -> raise(WF_error("Bad content model expression"))
1070 choice_or_seq (lparen_entid):
1071 /* parses either a regular expression, or a mixed expression. Returns
1072 * Mixed spec or Regexp spec (content_model_type).
1073 * Which kind of expression (regexp or mixed) is being read is recognized
1074 * after the first subexpression has been parsed; the other subexpressions
1075 * must be of the same kind.
1079 factor: choice_or_seq_factor()
1081 let (finalmark,subexpr), rparen_entid = factor in
1082 if lparen_entid != rparen_entid then
1083 raise (Validation_error "Entities not properly nested with parentheses");
1084 (* Check that the other subexpressions are "regexp", too, and
1085 * merge them with the first.
1090 | Alt alt -> Alt (re :: alt)
1091 | Seq seq -> Seq (re :: seq)
1094 (* Interpret the finalmark. *)
1095 match finalmark with
1097 | Plus -> Repeated1 re'
1098 | Star -> Repeated re'
1099 | Qmark -> Optional re'
1102 ? {{ raise(WF_error("Bad content model expression")) }}
1104 choice_or_seq_factor():
1105 /* Parses "|<subexpr>|...)" or ",<subexpr>,...)", both forms optionally
1106 * followed by ?, *, or +.
1107 * Returns ((finalmark, expr), rparen_entid), where
1108 * - finalmark is the character after the right parenthesis or Ignore
1110 * Alt [] meaning that only ")" has been found
1111 * Alt non_empty_list meaning that the subexpressions are separated by '|'
1112 * Seq non_empty_list meaning that the subexpressions are separated by ','
1115 {{ (Ignore, Alt []), entid }}
1117 {{ (Plus, Alt []), entid }}
1119 {{ (Star, Alt []), entid }}
1121 {{ (Qmark, Alt []), entid }}
1122 | Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
1124 let (finalmark, subexpr), rparen_entid = factor in
1125 begin match subexpr with
1126 Alt [] -> (finalmark, (Alt [re])), rparen_entid
1127 | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid
1128 | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
1131 ? {{ raise(WF_error("Bad content model expression")) }}
1132 | Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
1134 let (finalmark, subexpr), rparen_entid = factor in
1135 begin match subexpr with
1136 Alt [] -> (finalmark, (Seq [re])), rparen_entid
1137 | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid
1138 | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
1141 ? {{ raise(WF_error("Bad content model expression")) }}
1144 /* parse either a name, or a parenthesized subexpression "(...)" */
1145 name:Name m:multiplier()?
1148 | Some Plus -> Repeated1 (Child name)
1149 | Some Star -> Repeated (Child name)
1150 | Some Qmark -> Optional (Child name)
1153 ? {{ raise(WF_error("Bad content model expression")) }}
1154 | entid:Lparen Ignore* m:choice_or_seq(entid)
1156 ? {{ raise(WF_error("Bad content model expression")) }}
1159 /********************* ATTRIBUTE LIST DECLARATION ***********************/
1162 /* parses <!ATTLIST ... >. Enters the attribute list in dtd as side-
1165 decl_attlist_entid: Decl_attlist
1166 $ {{ let extdecl = context.manager # current_entity_counts_as_external in
1171 factor: attdef_factor()
1173 let at_list, decl_rangle_entid = factor in
1175 if decl_attlist_entid != decl_rangle_entid then
1176 raise (Validation_error "Entities not properly nested with ATTLIST declaration");
1178 if not ws && at_list <> [] then begin
1181 (* This is normally impossible, because the lexer demands
1182 * some other token between two names.
1184 raise(WF_error("Whitespace is missing before `" ^ name ^ "'"));
1188 if extend_dtd then begin
1189 let new_el = new dtd_element dtd el_name in
1190 (* Note that it is allowed that <!ATTLIST...> precedes the corresponding
1191 * <!ELEMENT...> declaration. In this case we add the element declaration
1192 * already to the DTD but leave the content model unspecified.
1196 dtd # add_element new_el;
1199 Not_found -> (* already added *)
1200 let old_el = dtd # element el_name in
1201 if old_el # attribute_names <> [] then
1202 config.warner # warn ("More than one ATTLIST declaration for element type `" ^
1207 (fun (a_name, a_type, a_default) ->
1208 el # add_attribute a_name a_type a_default extdecl)
1212 ? {{ match !yy_position with
1213 "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST"))
1214 | "el_name" -> raise(WF_error("The name of the element is expected here"))
1215 | "factor" -> raise(WF_error("Another attribute name or `>' expected"))
1216 | _ -> raise(WF_error("Bad attribute declaration"))
1221 /* parses a list of triples <name> <type> <default value> and returns the
1222 * list as (string * att_type * att_default) list.
1224 attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor()
1226 let attdef_rest, decl_rangle_entid = factor in
1227 if not ws && attdef_rest <> [] then begin
1228 match attdef_rest with
1230 raise(WF_error("Missing whitespace before `" ^ name ^ "'"));
1233 (attdef :: attdef_rest), decl_rangle_entid }}
1234 ? {{ match !yy_position with
1235 | "factor" -> raise(WF_error("Another attribute name or `>' expected"))
1236 | _ -> raise(WF_error("Bad attribute declaration"))
1243 /* Parses a single triple */
1248 default: defaultdecl()
1249 {{ (name,tp,default) }}
1250 ? {{ match !yy_position with
1251 ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
1252 | "tp" -> raise(WF_error("Type of attribute or `(' expected"))
1253 | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected"))
1254 | _ -> raise(WF_error("Bad attribute declaration"))
1258 /* Parses an attribute type and returns it as att_type. */
1261 if name = "NOTATION" then
1272 | "IDREF" -> A_idref
1273 | "IDREFS" -> A_idrefs
1274 | "ENTITY" -> A_entity
1275 | "ENTITIES" -> A_entities
1276 | "NMTOKEN" -> A_nmtoken
1277 | "NMTOKENS" -> A_nmtokens
1280 None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)"))
1283 | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected"))
1285 ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }}
1289 name: name_or_nametoken()
1291 names: nmtoken_factor()*
1294 {{ A_enum(name :: names) }}
1295 ? {{ match !yy_position with
1296 "name" -> raise(WF_error("Name expected"))
1297 | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
1298 | "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
1299 | _ -> raise(WF_error("Bad enumeration type"))
1304 /* The always failing rule */
1305 $ {{ raise Not_found; }}
1306 Doctype /* questionable */
1307 {{ A_cdata (* Does not matter *)
1317 names: notation_factor()*
1319 {{ A_notation(name :: names) }}
1320 ? {{ match !yy_position with
1321 "lp" -> raise(WF_error("`(' expected"))
1322 | "name" -> raise(WF_error("Name expected"))
1323 | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
1324 | "rp" -> raise(WF_error("`|' and more names expected, or `)'"))
1325 | _ -> raise(WF_error("Bad NOTATION type"))
1330 /* Parse "|<name>" and return the name */
1331 Bar Ignore* name:Name Ignore*
1333 ? {{ match !yy_position with
1334 "name" -> raise(WF_error("Name expected"))
1335 | _ -> raise(WF_error("Bad NOTATION type"))
1339 /* Parse "|<nmtoken>" and return the nmtoken */
1340 Bar Ignore* n:name_or_nametoken() Ignore*
1342 ? {{ match !yy_position with
1343 "n" -> raise(WF_error("Nametoken expected"))
1344 | _ -> raise(WF_error("Bad enumeration type"))
1348 name_or_nametoken():
1350 | n:Nametoken {{ n }}
1353 /* The default values must be expanded and normalized. This has been implemented
1354 * by the function expand_attvalue.
1359 /* Parse the default value for an attribute and return it as att_default */
1364 | Fixed ws:Ignore Ignore* str:Unparsed_string
1365 {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }}
1366 ? {{ match !yy_position with
1367 "ws" -> raise(WF_error("Whitespace is missing after #FIXED"))
1368 | "str" -> raise(WF_error("String literal expected"))
1369 | _ -> raise(WF_error("Bad #FIXED default value"))
1371 | str:Unparsed_string
1372 {{ D_default (expand_attvalue lexerset dtd str config.warner false) }}
1375 /**************************** ENTITY DECLARATION ***********************/
1377 entitydecl(decl_entity_entid):
1378 /* parses everything _after_ <!ENTITY until the matching >. The parsed
1379 * entity declaration is entered into the dtd object as side-effect.
1382 $ {{ let extdecl = context.manager # current_entity_counts_as_external in
1385 material: entitydef()
1387 decl_rangle_entid: Decl_rangle
1388 /* A general entity */
1390 if decl_entity_entid != decl_rangle_entid then
1391 raise (Validation_error "Entities not properly nested with ENTITY declaration");
1393 (* Distinguish between
1394 * - internal entities
1395 * - external entities
1396 * - NDATA (unparsed) entities
1399 (Some s, None, None) ->
1400 new internal_entity dtd name config.warner s p_internal_subset
1401 config.errors_with_line_numbers false config.encoding
1402 | (None, Some xid, None) ->
1403 new external_entity (resolver # clone) dtd name config.warner
1404 xid false config.errors_with_line_numbers
1407 | (None, Some xid, Some n) ->
1408 (new ndata_entity name xid n config.encoding :> entity)
1411 dtd # add_gen_entity en extdecl
1413 ? {{ match !yy_position with
1414 "ws" -> raise(WF_error("Whitespace is missing"))
1415 | "material" -> raise(WF_error("String literal or identifier expected"))
1416 | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
1417 | _ -> raise(WF_error("Bad entity declaration"))
1421 $ {{ let extdecl = context.manager # current_entity_counts_as_external in
1428 decl_rangle_entid: Decl_rangle
1429 /* A parameter entity */
1431 if decl_entity_entid != decl_rangle_entid then
1432 raise (Validation_error "Entities not properly nested with ENTITY declaration");
1434 (* Distinguish between internal and external entities *)
1437 new internal_entity dtd name config.warner s p_internal_subset
1438 config.errors_with_line_numbers true config.encoding
1439 | (None, Some xid) ->
1440 new external_entity (resolver # clone) dtd name config.warner
1441 xid true config.errors_with_line_numbers
1446 (* The following two lines force that even internal entities count
1447 * as external (for the standalone check) if the declaration of
1448 * the internal entity occurs in an external entity.
1451 en # set_counts_as_external;
1453 dtd # add_par_entity en;
1455 ? {{ match !yy_position with
1456 ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
1457 | "material" -> raise(WF_error("String literal or identifier expected"))
1458 | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
1459 | _ -> raise(WF_error("Bad entity declaration"))
1464 /* parses the definition value of a general entity. Returns either:
1465 * - (Some s, None, None) meaning the definition of an internal entity
1466 * with (literal) value s has been found
1467 * - (None, Some x, None) meaning that an external parsed entity with
1468 * external ID x has been found
1469 * - (None, Some x, Some n) meaning that an unparsed entity with
1470 * external ID x and notations n has been found
1473 {{ Some str, None, None }}
1474 | id:external_id() ws:Ignore? Ignore* decl:ndatadecl()?
1475 {{ if not ws && decl <> None then
1476 raise(WF_error("Whitespace missing before `NDATA'"));
1482 /* parses the definition value of a parameter entity. Returns either:
1483 * - (Some s, None) meaning that the definition of an internal entity
1484 * with (literal) value s has been found
1485 * - (None, Some x) meaning that an external ID x has been found
1488 {{ Some str, None }}
1494 /* Parses either NDATA "string" or the empty string; returns Some "string"
1495 * in the former, None in the latter case.
1497 ndata:Name ws:Ignore Ignore* name:Name
1498 {{ if ndata = "NDATA" then
1501 raise(WF_error("NDATA expected"))
1503 ? {{ match !yy_position with
1504 "ws" -> raise(WF_error("Whitespace is missing after NDATA"))
1505 | "name" -> raise(WF_error("Name expected"))
1506 | _ -> raise(WF_error("Bad NDATA declaration"))
1509 /**************************** NOTATION DECLARATION *******************/
1512 /* parses <!NOTATION ... > and enters the notation declaration into the
1513 * dtd object as side-effect
1515 decl_notation_entid: Decl_notation
1519 sys_or_public: Name /* SYSTEM or PUBLIC */
1521 str1: Unparsed_string
1523 str2: Unparsed_string?
1525 decl_rangle_entid: Decl_rangle
1527 if decl_notation_entid != decl_rangle_entid then
1528 raise (Validation_error "Entities not properly nested with NOTATION declaration");
1530 (* Note that it is allowed that PUBLIC is only followed by one
1533 match sys_or_public with
1535 if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument"));
1536 System (recode_utf8 str1)
1538 begin match str2 with
1540 check_public_id str1;
1541 Public(recode_utf8 str1,"")
1544 raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id"));
1545 check_public_id str1;
1546 Public(recode_utf8 str1, recode_utf8 p)
1548 | _ -> raise(WF_error("PUBLIC or SYSTEM expected"))
1550 if extend_dtd then begin
1551 let no = new dtd_notation name xid config.encoding in
1552 dtd # add_notation no
1555 ? {{ match !yy_position with
1556 ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing"))
1557 | "name" -> raise(WF_error("Name expected"))
1558 | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected"))
1559 | ("str1"|"str2") -> raise(WF_error("String literal expected"))
1560 | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
1561 | _ -> raise(WF_error("Bad NOTATION declaration"))
1564 /****************************** ELEMENTS **************************/
1566 /* In the following rules, the number of error rules is reduced to
1567 * improve the performance of the parser.
1572 /* parses <element>...</element> misc*, i.e. exactly one element followed
1573 * optionally by white space or processing instructions.
1574 * The element is entered into the global variables as follows:
1575 * - If elstack is non-empty, the parsed element is added as new child to
1576 * the top element of the stack.
1577 * - If elstack is empty, the root_examplar object is modified rather than
1578 * that a new element is created. If additionally the variable root is
1579 * None, it is assigned Some root_examplar.
1580 * Note that the modification of the root_exemplar is done by the method
1582 * The reason why the root element is modified rather than newly created
1583 * is a typing requirement. It must be possible that the class of the root
1584 * is derived from the original class element_impl, i.e. the user must be
1585 * able to add additional methods. If we created a new root object, we
1586 * would have to denote to which class the new object belongs; the root
1587 * would always be an 'element_impl' object (and not a derived object).
1588 * If we instead cloned an exemplar object and modified it by the
1589 * "create" method, the root object would belong to the same class as the
1590 * exemplar (good), but the type of the parsing function would always
1591 * state that an 'element_impl' was created (because we can pass the new
1592 * object only back via a global variable). The only solution is to
1593 * modify the object that has been passed to the parsing function directly.
1595 $ {{ dtd <- transform_dtd dtd; }}
1596 start_tag() content()*
1601 /* parses: start tags, end tags, content, or processing
1602 * instructions. That the tags are properly nested is dynamically checked.
1603 * As result, recognized elements are added to their parent elements,
1604 * content is added to the element containing it, and processing instructions
1605 * are entered into the element embracing them. (All as side-effects.)
1624 Begin_entity eref_xmldecl_then_rest()
1625 {{ if n_tags_open = 0 then
1626 raise(WF_error("Entity reference not allowed here"))
1630 /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
1632 eref_xmldecl_then_rest():
1634 $ {{ context.manager # current_entity # process_xmldecl pl;
1636 content()* End_entity
1639 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
1640 content() content()* End_entity
1643 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
1649 /* parses <element attribute-values> or <element attribute-values/>.
1651 * EFFECT: If elstack is non-empty, the element is added to the
1652 * top element of the stack as new child, and the element
1653 * is pushed on the stack. If elstack is empty, the root_exemplar is
1654 * modified and gets the parsed name and attribute list. The root_exemplar
1655 * is pushed on the stack. If additionally the variable root is empty, too,
1656 * this variable is initialized.
1657 * If the <element ... /> form has been parsed, no element is pushed
1662 if config.store_element_positions then
1663 Some(context.manager # position)
1669 attlist: attribute()*
1670 emptiness: start_tag_rangle()
1671 /* Note: it is guaranteed that there is whitespace between Tag_beg and
1672 * the name of the first attribute, because there must be some separator.
1673 * So we need not to check ws!
1676 let rec check_attlist al =
1678 (nv1, num1) :: al' ->
1679 if not num1 && al' <> [] then begin
1681 ((n1,_),_) :: ((n2,_),_) :: _ ->
1682 raise(WF_error("Whitespace is missing between attributes `" ^
1683 n1 ^ "' and `" ^ n2 ^ "'"))
1689 check_attlist attlist;
1691 let name, tag_beg_entid = tag in
1692 let attlist' = List.map (fun (nv,_) -> nv) attlist in
1694 create_element_node ?position:position spec dtd name attlist' in
1696 begin match id_index with
1699 (* Put the ID attribute into the index, if present *)
1701 let v = d # id_attribute_value in (* may raise Not_found *)
1702 idx # add v d (* may raise ID_not_unique *)
1705 (* No ID attribute *)
1708 (* There is already an ID with the same value *)
1709 raise(Validation_error("ID not unique"))
1713 if n_tags_open = 0 then begin
1714 if root = None then begin
1715 (* We have found the begin tag of the root element. *)
1716 if config.enable_super_root_node then begin
1717 (* The user wants the super root instead of the real root.
1718 * The real root element becomes the child of the VR.
1720 (* Assertion: self # current is the super root *)
1721 assert (self # current # node_type = T_super_root);
1722 root <- Some (self # current);
1723 self # current # add_node d;
1724 doc # init_root (self # current);
1727 (* Normal behaviour: The user wants to get the real root. *)
1733 (* We have found a second topmost element. This is illegal. *)
1734 raise(WF_error("Document must consist of only one toplevel element"))
1737 (* We have found some inner begin tag. *)
1738 self # save_data; (* Save outstanding data material first *)
1739 self # current # add_node d
1743 (* An empty tag like <a/>. *)
1744 d # local_validate ~use_dfa:config.validate_by_dfa ()
1746 (* A non-empty tag. *)
1747 Stack.push (d, tag_beg_entid) elstack;
1748 n_tags_open <- n_tags_open + 1;
1751 ? {{ match !yy_position with
1752 "attlist" -> raise(WF_error("Bad attribute list"))
1753 | "emptiness" -> raise(WF_error("`>' or `/>' expected"))
1754 | _ -> raise(WF_error("Bad start tag"))
1759 /* Parses name="value" */
1760 n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore*
1766 {{ expand_attvalue lexerset dtd v config.warner true }}
1767 | v:Attval_nl_normalized
1768 {{ expand_attvalue lexerset dtd v config.warner false }}
1773 | Rangle_empty {{ true }}
1777 /* parses </element>.
1778 * Pops the top element from the elstack and checks if it is the same
1781 tag:Tag_end Ignore* Rangle
1782 {{ let name, tag_end_entid = tag in
1783 if n_tags_open = 0 then
1784 raise(WF_error("End-tag without start-tag"));
1786 self # save_data; (* Save outstanding data material first *)
1788 let x, tag_beg_entid = Stack.pop elstack in
1790 match x # node_type with
1794 if name <> x_name then
1795 raise(WF_error("End-tag does not match start-tag"));
1796 if tag_beg_entid != tag_end_entid then
1797 raise(WF_error("End-tag not in the same entity as the start-tag"));
1798 x # local_validate ~use_dfa:config.validate_by_dfa ();
1800 n_tags_open <- n_tags_open - 1;
1802 assert (n_tags_open >= 0);
1807 /* Parses any literal characters not otherwise matching, and adds the
1808 * characters to the top element of elstack.
1809 * If elstack is empty, it is assumed that there is no surrounding
1810 * element, and any non-white space character is forbidden.
1814 if n_tags_open = 0 then
1815 (* only white space is allowed *)
1816 self # only_whitespace data
1818 self # collect_data data
1819 (* We collect the chardata material until the next end tag is
1820 * reached. Then the collected material will concatenated and
1821 * stored as a single T_data node (see end_tag rule above)
1827 if n_tags_open = 0 then
1828 raise (WF_error("CDATA section not allowed here"));
1829 self # collect_data data
1830 (* Also collect CDATA material *)
1834 /* Parses &#...; and adds the character to the top element of elstack. */
1837 if n_tags_open = 0 then
1838 (* No surrounding element: character references are not allowed *)
1839 raise(WF_error("Character reference not allowed here"));
1840 self # collect_data (character config.encoding config.warner code)
1841 (* Also collect character references *)
1845 /* Parses <?...?> (but not <?xml white-space ... ?>).
1846 * If there is a top element in elstack, the processing instruction is added
1852 if config.store_element_positions then
1853 Some(context.manager # position)
1857 let target,value = pi in
1859 if n_tags_open = 0 & not config.enable_super_root_node
1861 doc # add_pinstr (new proc_instruction target value config.encoding)
1863 (* Special case: if processing instructions are processed inline,
1864 * they are wrapped into T_pinstr nodes.
1866 if config.enable_pinstr_nodes then begin
1867 self # save_data; (* Save outstanding data material first *)
1868 let pinstr = new proc_instruction target value config.encoding in
1869 let wrapper = create_pinstr_node
1870 ?position:position spec dtd pinstr in
1871 wrapper # local_validate(); (* succeeds always *)
1872 self # current # add_node wrapper;
1875 (* Normal behaviour: Add the PI to the parent element. *)
1876 self # current # add_pinstr
1877 (new proc_instruction target value config.encoding)
1883 /* Parses <!-- ... -->
1888 if config.enable_comment_nodes && config.store_element_positions then
1889 Some(context.manager # position)
1894 mat: Comment_material*
1897 if config.enable_comment_nodes then begin
1898 self # save_data; (* Save outstanding data material first *)
1899 let comment_text = String.concat "" mat in
1900 let wrapper = create_comment_node
1901 ?position:position spec dtd comment_text in
1902 wrapper # local_validate(); (* succeeds always *)
1903 self # current # add_node wrapper;
1906 ? {{ match !yy_position with
1907 | "ce" -> raise(WF_error("`-->' expected"))
1908 | _ -> raise(WF_error("Bad comment"))
1913 (* The method "parse" continues here... *)
1916 match start_symbol with
1918 parse_ext_document context.current context.get_next
1919 | Ext_declarations ->
1920 parse_ext_declarations context.current context.get_next
1922 parse_ext_element context.current context.get_next
1925 raise Parsing.Parse_error
1927 (*********** The method "parse" ends here *************)
1930 (**********************************************************************)
1932 (* Here ends the class definition: *)
1936 (**********************************************************************)
1943 val mutable node = (None : ('a extension node as 'a) option)
1944 method clone = {< >}
1956 let default_extension = new default_ext;;
1959 make_spec_from_mapping
1960 ~super_root_exemplar: (new element_impl default_extension)
1961 ~comment_exemplar: (new element_impl default_extension)
1962 ~default_pinstr_exemplar: (new element_impl default_extension)
1963 ~data_exemplar: (new data_impl default_extension)
1964 ~default_element_exemplar: (new element_impl default_extension)
1965 ~element_mapping: (Hashtbl.create 1)
1970 let idref_pass id_index root =
1971 let error t att value =
1973 match t # node_type with
1974 T_element name -> name
1978 "Attribute `" ^ att ^ "' of element `" ^ name ^
1979 "' refers to unknown ID `" ^ value ^ "'" in
1980 let pos_ent, pos_line, pos_col = t # position in
1981 if pos_line = 0 then
1982 raise(Validation_error text)
1984 raise(At("In entity " ^ pos_ent ^ " at line " ^
1985 string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^
1987 Validation_error text))
1990 let rec check_tree t =
1991 let idref_atts = t # idref_attribute_names in
1994 match t # attribute att with
1996 begin try ignore(id_index # find s) with
2003 try ignore(id_index # find s) with
2008 | Implied_value -> ()
2011 List.iter check_tree (t # sub_nodes)
2017 exception Return_DTD of dtd;;
2018 (* Used by extract_dtd_from_document_entity to jump out of the parser *)
2021 let call_parser ~configuration:cfg
2029 ~(id_index : 'ext #index option)
2030 ~use_document_entity
2033 let e = cfg.errors_with_line_numbers in
2034 let w = cfg.warner in
2037 Entity(m,r') -> r', m dtd
2038 | ExtID(xid,r') -> r',
2039 if use_document_entity then
2041 r' dtd "[toplevel]" w xid e
2045 r' dtd "[toplevel]" w xid false e
2048 r # init_rep_encoding cfg.encoding;
2050 en # set_debugging_mode (cfg.debugging_mode);
2061 (id_index :> 'ext index option)
2063 let mgr = new entity_manager en in
2064 en # open_entity true init_lexer;
2066 let context = make_context mgr in
2067 pobj # parse context entry;
2068 ignore(en # close_entity);
2071 ignore(en # close_entity);
2074 ignore(en # close_entity);
2076 let pos = mgr # position_string in
2077 raise (At(pos, error))
2079 if cfg.idref_pass then begin
2083 ( match pobj # root with
2086 idref_pass idx root;
2092 let parse_dtd_entity cfg src =
2093 (* Parse a DTD given as separate entity. *)
2094 let dtd = new dtd cfg.warner cfg.encoding in
2095 let doc = new document cfg.warner in
2101 ~extensible_dtd:true (* Extend the DTD by parsed declarations *)
2103 ~specification:default_spec
2104 ~process_xmldecl:false (* The XML declaration is ignored
2105 * (except 'encoding')
2107 ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
2109 ~use_document_entity:false
2110 ~entry:Ext_declarations (* Entry point of the grammar *)
2111 ~init_lexer:Declaration (* The initially used lexer *)
2114 if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
2119 let parse_content_entity ?id_index cfg src dtd spec =
2120 (* Parse an element given as separate entity *)
2121 dtd # validate; (* ensure that the DTD is valid *)
2122 if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
2123 let doc = new document cfg.warner in
2129 ~extensible_dtd:true (* Extend the DTD by parsed declarations *)
2132 ~process_xmldecl:false (* The XML declaration is ignored
2133 * (except 'encoding')
2135 ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
2136 ~id_index:(id_index :> 'ext index option)
2137 ~use_document_entity:false
2138 ~entry:Ext_element (* Entry point of the grammar *)
2139 ~init_lexer:Content (* The initially used lexer *)
2141 match pobj # root with
2143 | None -> raise(WF_error("No root element"))
2147 let parse_wfcontent_entity cfg src spec =
2148 let dtd = new dtd cfg.warner cfg.encoding in
2149 dtd # allow_arbitrary;
2150 let doc = new document cfg.warner in
2156 ~extensible_dtd:false (* Do not extend the DTD *)
2159 ~process_xmldecl:false (* The XML declaration is ignored
2160 * (except 'encoding')
2162 ~transform_dtd:(fun x -> x) (* Do not transform the DTD *)
2164 ~use_document_entity:false
2165 ~entry:Ext_element (* Entry point of the grammar *)
2166 ~init_lexer:Content (* The initially used lexer *)
2168 match pobj # root with
2170 | None -> raise(WF_error("No root element"))
2174 let iparse_document_entity ?(transform_dtd = (fun x -> x))
2176 cfg0 src spec p_wf =
2177 (* Parse an element given as separate entity *)
2178 (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *)
2179 let cfg = { cfg0 with
2180 recognize_standalone_declaration =
2181 cfg0.recognize_standalone_declaration && (not p_wf)
2183 let dtd = new dtd cfg.warner cfg.encoding in
2185 dtd # allow_arbitrary;
2186 let doc = new document cfg.warner in
2192 ~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations
2193 * only if in validating mode
2197 ~process_xmldecl:true (* The XML declaration is processed *)
2198 (* TODO: change to 'not p_wf' ? *)
2199 ~transform_dtd:(fun dtd ->
2200 let dtd' = transform_dtd dtd in
2201 if cfg.accept_only_deterministic_models then
2202 dtd' # only_deterministic_models;
2205 ~id_index:(id_index :> 'ext index option)
2206 ~use_document_entity:true
2207 ~entry:Ext_document (* Entry point of the grammar *)
2208 ~init_lexer:Document (* The initially used lexer *)
2214 let parse_document_entity ?(transform_dtd = (fun x -> x))
2217 iparse_document_entity
2218 ~transform_dtd:transform_dtd
2219 ?id_index:(id_index : 'ext #index option :> 'ext index option)
2220 cfg src spec false;;
2222 let parse_wfdocument_entity cfg src spec =
2223 iparse_document_entity cfg src spec true;;
2225 let extract_dtd_from_document_entity cfg src =
2226 let transform_dtd dtd = raise (Return_DTD dtd) in
2228 let doc = parse_document_entity
2229 ~transform_dtd:transform_dtd
2233 (* Should not happen: *)
2237 (* The normal case: *)
2242 let default_config =
2243 let w = new drop_warnings in
2245 errors_with_line_numbers = true;
2246 enable_pinstr_nodes = false;
2247 enable_super_root_node = false;
2248 enable_comment_nodes = false;
2249 encoding = `Enc_iso88591;
2250 recognize_standalone_declaration = true;
2251 store_element_positions = true;
2253 validate_by_dfa = true;
2254 accept_only_deterministic_models = true;
2255 debugging_mode = false;
2259 class [ 'ext ] hash_index =
2261 constraint 'ext = 'ext node #extension
2262 val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t)
2265 ignore(Hashtbl.find ht s);
2271 method find s = Hashtbl.find ht s
2276 (* ======================================================================
2280 * Revision 1.1 2000/11/17 09:57:29 lpadovan
2283 * Revision 1.14 2000/08/26 23:23:14 gerd
2284 * Bug: from_file must not interpret the file name as URL path.
2285 * Bug: When PI and comment nodes are generated, the collected data
2286 * material must be saved first.
2288 * Revision 1.13 2000/08/19 21:30:03 gerd
2289 * Improved the error messages of the parser
2291 * Revision 1.12 2000/08/18 20:16:25 gerd
2292 * Implemented that Super root nodes, pinstr nodes and comment
2293 * nodes are included into the document tree.
2295 * Revision 1.11 2000/08/14 22:24:55 gerd
2296 * Moved the module Pxp_encoding to the netstring package under
2297 * the new name Netconversion.
2299 * Revision 1.10 2000/07/23 02:16:33 gerd
2302 * Revision 1.9 2000/07/14 13:57:29 gerd
2303 * Added the id_index feature.
2305 * Revision 1.8 2000/07/09 17:52:45 gerd
2306 * New implementation for current_data.
2307 * The position of elements is stored on demand.
2309 * Revision 1.7 2000/07/09 01:00:35 gerd
2310 * Improvement: It is now guaranteed that only one data node
2311 * is added for consecutive character material.
2313 * Revision 1.6 2000/07/08 16:27:29 gerd
2314 * Cleaned up the functions calling the parser.
2315 * New parser argument: transform_dtd.
2316 * Implementations for 'extract_dtd_from_document_entity' and
2317 * 'parse_wfcontent_entity'.
2319 * Revision 1.5 2000/07/06 23:05:18 gerd
2320 * Initializations of resolvers were missing.
2322 * Revision 1.4 2000/07/06 22:11:01 gerd
2323 * Fix: The creation of the non-virtual root element is protected
2324 * in the same way as the virtual root element.
2326 * Revision 1.3 2000/07/04 22:15:18 gerd
2327 * Change: Using the new resolver capabilities.
2328 * Still incomplete: the new extraction and parsing functions.
2330 * Revision 1.2 2000/06/14 22:19:06 gerd
2331 * Added checks such that it is impossible to mix encodings.
2333 * Revision 1.1 2000/05/29 23:48:38 gerd
2334 * Changed module names:
2335 * Markup_aux into Pxp_aux
2336 * Markup_codewriter into Pxp_codewriter
2337 * Markup_document into Pxp_document
2338 * Markup_dtd into Pxp_dtd
2339 * Markup_entity into Pxp_entity
2340 * Markup_lexer_types into Pxp_lexer_types
2341 * Markup_reader into Pxp_reader
2342 * Markup_types into Pxp_types
2343 * Markup_yacc into Pxp_yacc
2344 * See directory "compatibility" for (almost) compatible wrappers emulating
2345 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
2347 * ======================================================================
2348 * Old logs from markup_yacc.m2y:
2350 * Revision 1.9 2000/05/29 21:14:57 gerd
2351 * Changed the type 'encoding' into a polymorphic variant.
2353 * Revision 1.8 2000/05/27 19:26:19 gerd
2354 * Change: The XML declaration is interpreted right after
2355 * it has been parsed (no longer after the document): new function
2356 * check_and_parse_xmldecl.
2357 * When elements, attributes, and entities are declared
2358 * it is stored whether the declaration happens in an external
2359 * entity (for the standalone check).
2360 * The option recognize_standalone_declaration is interpreted.
2362 * Revision 1.7 2000/05/20 20:31:40 gerd
2363 * Big change: Added support for various encodings of the
2364 * internal representation.
2366 * Revision 1.6 2000/05/14 21:51:24 gerd
2367 * Change: Whitespace is handled by the grammar, and no longer
2370 * Revision 1.5 2000/05/14 17:50:54 gerd
2371 * Updates because of changes in the token type.
2373 * Revision 1.4 2000/05/11 22:09:17 gerd
2374 * Fixed the remaining problems with conditional sections.
2375 * This seems to be also a weakness of the XML spec!
2377 * Revision 1.3 2000/05/09 00:02:44 gerd
2378 * Conditional sections are now recognized by the parser.
2379 * There seem some open questions; see the TODO comments!
2381 * Revision 1.2 2000/05/08 22:01:44 gerd
2382 * Introduced entity managers (see markup_entity.ml).
2383 * The XML declaration is now recognized by the parser. If such
2384 * a declaration is found, the method process_xmldecl of the currently
2385 * active entity is called. If the first token is not an XML declaration,
2386 * the method process_missing_xmldecl is called instead.
2387 * Some minor changes.
2389 * Revision 1.1 2000/05/06 23:21:49 gerd
2393 * ======================================================================
2395 * COPIED FROM REVISION 1.19 OF markup_yacc.mly
2397 * Revision 1.19 2000/05/01 15:20:08 gerd
2398 * "End tag matches start tag" is checked before "End tag in the
2399 * same entity as start tag".
2401 * Revision 1.18 2000/04/30 18:23:08 gerd
2402 * Bigger change: Introduced the concept of virtual roots. First,
2403 * this reduces the number of checks. Second, it makes it possible to
2404 * return the virtual root to the caller instead of the real root (new
2405 * config options 'virtual_root' and 'processing_instructions_inline').
2406 * Minor changes because of better CR/CRLF handling.
2408 * Revision 1.17 2000/03/13 23:47:46 gerd
2409 * Updated because of interface changes. (See markup_yacc_shadow.mli
2412 * Revision 1.16 2000/01/20 20:54:43 gerd
2413 * New config.errors_with_line_numbers.
2415 * Revision 1.15 1999/12/17 22:27:58 gerd
2416 * Bugfix: The value of 'p_internal_subset' (an instance
2417 * variable of the parser object) is to true when the internal subset
2418 * begins, and is set to false when this subset ends. The error was
2419 * that references to external entities within this subset did not
2420 * set 'p_internal_subset' to false; this is now corrected by introducing
2421 * the 'p_internal_subset_stack'.
2422 * This is a typical example of how the code gets more and
2423 * more complicated and that it is very difficult to really understand
2426 * Revision 1.14 1999/11/09 22:23:37 gerd
2427 * Removed the invocation of "init_dtd" of the root document.
2428 * This method is no longer available. The DTD is also passed to the
2429 * document object by the root element, so nothing essential changes.
2431 * Revision 1.13 1999/10/25 23:37:09 gerd
2432 * Bugfix: The warning "More than one ATTLIST declaration for element
2433 * type ..." is only generated if an ATTLIST is found while there are already
2434 * attributes for the element.
2436 * Revision 1.12 1999/09/01 23:08:38 gerd
2437 * New frontend function: parse_wf_document. This simply uses
2438 * a DTD that allows anything, and by the new parameter "extend_dtd" it is
2439 * avoided that element, attlist, and notation declarations are added to this
2440 * DTD. The idea is that this function simulates a well-formedness parser.
2441 * Tag_beg, Tag_end carry the entity_id. The "elstack" stores the
2442 * entity_id of the stacked tag. This was necessary because otherwise there
2443 * are some examples to produces incorrectly nested elements.
2444 * p_internal_subset is a variable that stores whether the internal
2445 * subset is being parsed. This is important beacause entity declarations in
2446 * internal subsets are not allowed to contain parameter references.
2447 * It is checked if the "elstack" is empty after all has been parsed.
2448 * Processing instructions outside DTDs and outside elements are now
2449 * added to the document.
2450 * The rules of mixed and regexp style content models have been
2451 * separated. The code is now much simpler.
2452 * Entity references outside elements are detected and rejected.
2454 * Revision 1.11 1999/09/01 16:26:08 gerd
2455 * Improved the quality of error messages.
2457 * Revision 1.10 1999/08/31 19:13:31 gerd
2458 * Added checks on proper PE nesting. The idea is that tokens such
2459 * as Decl_element and Decl_rangle carry an entity ID with them. This ID
2460 * is simply an object of type < >, i.e. you can only test on identity.
2461 * The lexer always produces tokens with a dummy ID because it does not
2462 * know which entity is the current one. The entity layer replaces the dummy
2463 * ID with the actual ID. The parser checks that the IDs of pairs such as
2464 * Decl_element and Decl_rangle are the same; otherwise a Validation_error
2467 * Revision 1.9 1999/08/15 20:42:01 gerd
2468 * Corrected a misleading message.
2470 * Revision 1.8 1999/08/15 20:37:34 gerd
2471 * Improved error messages.
2472 * Bugfix: While parsing document entities, the subclass document_entity is
2473 * now used instead of external_entity. The rules in document entities are a bit
2476 * Revision 1.7 1999/08/15 14:03:59 gerd
2477 * Empty documents are not allowed.
2478 * "CDATA section not allowed here" is a WF_error, not a Validation_
2481 * Revision 1.6 1999/08/15 02:24:19 gerd
2482 * Removed some grammar rules that were used for testing.
2483 * Documents without DTD can now have arbitrary elements (formerly
2484 * they were not allowed to have any element).
2486 * Revision 1.5 1999/08/14 22:57:20 gerd
2487 * It is allowed that external entities are empty because the
2488 * empty string is well-parsed for both declarations and contents. Empty
2489 * entities can be referenced anywhere because the references are replaced
2490 * by nothing. Because of this, the Begin_entity...End_entity brace is only
2491 * inserted if the entity is non-empty. (Otherwise references to empty
2492 * entities would not be allowed anywhere.)
2493 * As a consequence, the grammar has been changed such that a
2494 * single Eof is equivalent to Begin_entity,End_entity without content.
2496 * Revision 1.4 1999/08/14 22:20:01 gerd
2497 * The "config" slot has now a component "warner" which is
2498 * an object with a "warn" method. This is used to warn about characters
2499 * that cannot be represented in the Latin 1 alphabet.
2500 * Furthermore, there is a new component "debugging_mode".
2501 * Some Parse_error exceptions have been changed into Validation_error.
2502 * The interfaces of functions/classes imported from other modules
2503 * have changed; the invocations have been adapted.
2504 * Contents may contain CDATA sections that have been forgotten.
2506 * Revision 1.3 1999/08/11 15:00:41 gerd
2507 * The Begin_entity ... End_entity brace is also possible in
2509 * The configuration passed to the parsing object contains always
2510 * the resolver that is actually used.
2512 * Revision 1.2 1999/08/10 21:35:12 gerd
2513 * The XML/encoding declaration at the beginning of entities is
2514 * evaluated. In particular, entities have now a method "xml_declaration"
2515 * which returns the name/value pairs of such a declaration. The "encoding"
2516 * setting is interpreted by the entity itself; "version", and "standalone"
2517 * are interpreted by Markup_yacc.parse_document_entity. Other settings
2518 * are ignored (this does not conform to the standard; the standard prescribes
2519 * that "version" MUST be given in the declaration of document; "standalone"
2520 * and "encoding" CAN be declared; no other settings are allowed).
2521 * TODO: The user should be warned if the standard is not exactly
2522 * fulfilled. -- The "standalone" property is not checked yet.
2524 * Revision 1.1 1999/08/10 00:35:52 gerd