(* $Id$ -*- tuareg -*-
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)

open Parsing
open Pxp_types
open Pxp_lexer_types
open Pxp_dtd
open Pxp_entity
open Pxp_document
open Pxp_aux

(* Some types from the interface definition: *)

(* Raised by an [index] implementation when the same ID value is
 * registered twice (the contract lives in the interface file;
 * not used directly in this chunk).
 *)
exception ID_not_unique

(* An [index] maps ID attribute values (strings) to the nodes bearing
 * them. [add] registers a node under an ID; [find] looks an ID up.
 *)
class type [ 'ext ] index =
object
  constraint 'ext = 'ext node #extension
  method add : string -> 'ext node -> unit
  method find : string -> 'ext node
end

(* Global parser configuration. The flags are read by the parser object
 * below (e.g. [encoding] selects the lexer set, and
 * [enable_super_root_node] controls what the initializer pushes onto
 * the element stack).
 *)
type config =
    { warner : collect_warnings;
        (* Object receiving warning messages *)
      errors_with_line_numbers : bool;
        (* Whether line numbers are tracked for error reporting *)
      enable_pinstr_nodes : bool;
        (* Whether nodes are created for processing instructions *)
      enable_super_root_node : bool;
        (* Whether a "super root" node is created above the root element *)
      enable_comment_nodes : bool;
        (* Whether nodes are created for comments *)
      encoding : rep_encoding;
        (* The internal representation encoding *)
      recognize_standalone_declaration : bool;
        (* Whether a standalone="..." declaration is honoured *)
      store_element_positions : bool;
        (* Whether source positions of elements are remembered *)
      idref_pass : bool;
        (* Whether IDREF attributes are checked (extra pass) *)
      validate_by_dfa : bool;
        (* Whether content models are validated with DFAs *)
      accept_only_deterministic_models : bool;
        (* Whether non-deterministic content models are rejected *)
      debugging_mode : bool;
        (* Whether debugging output is enabled *)
    }

(* Where the XML text comes from: either a ready-made entity factory
 * plus resolver, or an external ID to be resolved by the given
 * resolver.
 *)
type source =
    Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
  | ExtID of (ext_id * Pxp_reader.resolver)

(* Grammar start symbol: selects what the [parse] method accepts. *)
type start_symbol =
    Ext_document        (* a complete XML document *)
  | Ext_declarations    (* an external DTD subset *)
  | Ext_element         (* a single element *)

(* The token-stream interface between the entity manager and the
 * generated parser.
 *)
type context =
    { mutable current : unit -> token;
        (* get the current token *)
      mutable get_next : unit -> token;
        (* go on to the next token; return it *)
      mutable current_token : token;
        (* This is the current token *)
      mutable manager : entity_manager;
        (* The entity manager *)
    }

(* [make_context entity_manager] creates a fresh [context] wired to
 * [entity_manager], and already advances it to the first token.
 *)
let make_context entity_manager =
  let c =
    { current = (fun _ -> assert false);   (* dummy; replaced below *)
      get_next = (fun _ -> assert false);  (* dummy; replaced below *)
      current_token = Eof;
      manager = entity_manager;
    }
  in
  (* Note that the function which is stored in get_next_ref can be changed
   * as a side-effect when an entity is opened or closed. The function in
   * c.get_next must be programmed such that always the current "get_next"
   * function is executed.
*)
  let get_next_ref = entity_manager # yy_get_next_ref in
  c.current <- (fun () -> c.current_token);
  c.get_next <- (fun () ->
                   (* Dereference on every call: the entity manager may
                    * replace the function behind [get_next_ref]. *)
                   let tok = !get_next_ref() in
                   c.current_token <- tok;
                   tok);
  (* Load the first token so that c.current_token is valid: *)
  ignore(c.get_next());
  c
;;

(* [from_channel ?system_encoding ?id ?fixenc ch] builds a [source]
 * reading the XML text from the already open channel [ch]. The channel
 * can only be read once; any further attempt to open the same source
 * is rejected (see [channel_of_url] below).
 *)
let from_channel ?system_encoding ?id:init_id ?fixenc ch =
  (* Reading from a channel works by modifying the algorithm of
   * resolve_as_file.
   *)
  let url_syntax =
    (* A syntax suitable for "file" URLs *)
    { Neturl.null_url_syntax with
        Neturl.url_enable_scheme = Neturl.Url_part_allowed;
        Neturl.url_enable_host = Neturl.Url_part_allowed;
        Neturl.url_enable_path = Neturl.Url_part_required;
        Neturl.url_accepts_8bits = true;
    }
  in
  let an_url =
    (* The URL file:/// - returned when no ID is associated with [ch] *)
    Neturl.make_url
      ~scheme: "file"
      ~host: ""
      ~path: [ "" ]
      url_syntax
  in
  let init_channel_done = ref false in
    (* Whether the first access to this source has already happened. *)
  (* The task of url_of_id is:
   * - When it is called the first time, and no init_id is present,
   *   the URL file:/// is passed back (an_url). This forces that
   *   absolute path names /path/dir/... will be interpreted as
   *   file path names. (But relative path names will not work.)
   * - If an init_id has been passed, we can assume that the opened URL
   *   is exactly this init_id. By raising Not_competent it is indicated
   *   that the standard method is to be used for the interpretation of
   *   the URL.
   * - Otherwise, the channel is already being read, and thus cannot again
   *   opened. (This case is handled in channel_of_url.)
   *)
  let url_of_id xid =
    if !init_channel_done then begin
      (* Use the normal way of determining the URL of the ID: *)
      raise Pxp_reader.Not_competent
    end
    else begin
      match init_id with
          None ->
            an_url
            (* If the channel is not associated with any URL: Simply pass
             * the URL file:/// back.
             *)
        | Some the_init_id ->
            assert (the_init_id = xid);
            raise Pxp_reader.Not_competent
            (* If the channel is associated with a URL, the corresponding
             * ID must be passed when the first invocation happens.
*)
    end
  in
  (* The task of channel_of_url:
   * - If it is called the first time ("else"), the channel is returned
   * - Otherwise, the channel is already being read, and thus cannot again
   *   opened. By raising Not_competent it is signaled that the
   *   resolve_as_file object must not continue to open the URL.
   *)
  let channel_of_url url =
    if !init_channel_done then
      raise Pxp_reader.Not_competent
    else begin
      init_channel_done := true;
      ch, fixenc
    end
  in
  let r =
    new Pxp_reader.resolve_as_file
      ?system_encoding:system_encoding
      ~url_of_id:url_of_id
      ~channel_of_url:channel_of_url
      ()
  in
  let init_xid =
    match init_id with
        None -> Anonymous
      | Some id ->
          (* Note: 'id' may be illegal (malformed); in this case, the first
           * invocation of url_of_id will raise Not_competent, and the 'open_in'
           * method will fail.
           *)
          id
  in
  ExtID(init_xid, r)
;;

(* [from_file ?system_encoding utf8_filename] builds a [source] for the
 * file with the given UTF-8 encoded name. A relative name is made
 * absolute by prepending the current working directory; the result is
 * represented as a file://localhost/... URL.
 *)
let from_file ?system_encoding utf8_filename =
  let r =
    new Pxp_reader.resolve_as_file
      ?system_encoding:system_encoding
      ()
  in
  let utf8_abs_filename =
    if utf8_filename <> "" && utf8_filename.[0] = '/' then
      utf8_filename
    else
      Sys.getcwd() ^ "/" ^ utf8_filename
  in
  let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in
  let url =
    Neturl.make_url
      ~scheme:"file"
      ~host:"localhost"
      ~path:(Neturl.split_path utf8_abs_filename)
      syntax
  in
  let xid = System (Neturl.string_of_url url) in
  ExtID(xid, r)
;;

(* [from_string ?fixenc s] builds a [source] reading the XML text
 * directly from the string [s]; [fixenc] optionally forces the
 * character encoding of [s].
 *)
let from_string ?fixenc s =
  let r = new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in
  ExtID(Anonymous, r)
;;

(**********************************************************************)

(* The parser object. The grammar rules below (after the %% marker) are
 * transformed by m2parsergen into the body of the method [parse]
 * (see the comment preceding that method).
 *)
class ['ext] parser_object
        init_doc init_dtd init_extend_dtd init_config init_resolver
        init_spec init_process_xmldecl transform_dtd id_index =
  object (self)
    (* Note that the 'ext parameter has been the motivation to make the
     * parser a class.
     *)

    val mutable dtd = init_dtd
      (* The DTD being parsed; or the DTD currently assumed *)

    val extend_dtd = init_extend_dtd
      (* Whether the DTD should be extended by ELEMENT, ATTLIST, and
       * NOTATION declarations or not.
(True for validating mode, false for well-formedness mode.)
       *)

    val transform_dtd = transform_dtd
      (* A function transforming the DTD *)

    val id_index = (id_index : 'ext index option)
      (* The ID index or None *)

    val process_xmldecl = init_process_xmldecl
      (* Whether the XML declaration is parsed and the found XML version
       * and standalone declaration are passed to 'doc'.
       *)

    val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding)
      (* The set of lexer functions matching the configured encoding *)

    val doc = init_doc
      (* The current document *)

    method doc =
      (doc : 'ext document)

    val resolver = init_resolver
      (* The resolver for external IDs *)

    val config = init_config
      (* The current configuration *)

    val elstack = (Stack.create() : ('ext node * entity_id) Stack.t)
      (* The element stack containing all open elements, i.e. elements that
       * have begun by a start tag but that have not been finished (end tag).
       * If the parser sees a start tag, it creates the element and pushes it
       * on top of this stack. If the parser recognizes an end tag, it pulls
       * one element from the stack and checks if it has the same name as
       * given with the end tag.
       *
       * At initialization time, a special element is pushed on the stack,
       * the so-called super root. It is always the bottommost
       * element of the stack, and serves as a guard.
       * [See "initializer" below.]
       *)

    method current =
      (* Get the top element of the element stack *)
      try
        fst(Stack.top elstack)
      with
          Stack.Empty -> assert false
            (* Not possible, because the super root is always the element
             * at the bottom of the stack.
             *)

    val mutable n_tags_open = 0
      (* Number of begin tags that have been parsed and whose corresponding
       * end tags have not yet been parsed *)

    val mutable p_internal_subset = false
      (* true while parsing the internal subset - there are some additional
       * constraints for internal subsets, and because of this it must
       * be known whether the current declaration is contained in the
       * internal or external subset of the DTD.
*)

    val mutable root = None
      (* Contains the root element (topmost element) while it is being parsed
       * and after it has been parsed.
       * This variable is None before the root element is seen.
       *)

    method root = root

    val spec = init_spec
      (* A hashtable that contains exemplar objects for the various element
       * types. If an element is parsed, the exemplar is looked up and
       * "cloned" (by the "create" method)
       *)

    val mutable current_data = []
      (* Collects character data. New fragments are consed to the front,
       * so the list holds the fragments in reverse order of arrival.
       *)

    method collect_data s =
      (* Collects the character material 's' *)
      current_data <- s :: current_data

    method save_data =
      (* Puts the material collected in 'current_data' into a new
       * node, and appends this node as new sub node to 'current'
       *)
      match current_data with
          [] ->
            ()
        | [ str ] ->
            (* Common case: a single fragment, no concatenation needed *)
            if str <> "" then
              self # current # add_node (create_data_node spec dtd str);
            current_data <- []
        | _ ->
            (* Several fragments: concatenate them into one string.
             * Because the list is in reverse order, the buffer is filled
             * from right to left.
             * NOTE(review): String.create / in-place String.blit are the
             * pre-"bytes" mutable-string API; this file targets a legacy
             * OCaml version.
             *)
            let count =
              List.fold_left
                (fun acc s -> acc + String.length s)
                0
                current_data
            in
            let str = String.create count in
            let pos = ref count in
            List.iter
              (fun s ->
                 let l = String.length s in
                 pos := !pos - l;
                 String.blit ~src:s ~src_pos:0 ~dst:str ~dst_pos:(!pos) ~len:l
              )
              current_data;
            assert(!pos = 0);
            if str <> "" then
              self # current # add_node (create_data_node spec dtd str);
            current_data <- []

    method only_whitespace data =
      (* Checks that the string "data" contains only whitespace, by
       * scanning it with the lexer: it must yield exactly one Ignore
       * token followed by Eof.
       * NOTE(review): the original comment said Validation_error is
       * raised on failure, but the code actually raises WF_error -
       * the comment was probably outdated.
       *)
      let lexbuf = Lexing.from_string data in
      let t1 = lexerset.scan_name_string lexbuf in
      if t1 <> Ignore then raise(WF_error("Data not allowed here"));
      let t2 = lexerset.scan_name_string lexbuf in
      if t2 <> Eof then raise(WF_error("Data not allowed here"));
      ()

    initializer
      (* CHECKS: *)
      (* The DTD must use the same internal encoding as configured: *)
      if config.encoding <> dtd # encoding then
        failwith("Encoding mismatch");

      (* --- Initialize 'elstack': Push the super-root on the stack.
*) let super_root = if config.enable_super_root_node then create_super_root_node spec dtd else (* because spec may not contain an exemplar for the super root: *) create_no_node spec dtd in (* Move the super root or the emulation to the stack: *) Stack.push (super_root, (self :> entity_id)) elstack; (********* Here the method "parse" begins. The grammar below is * transformed to a local function of this method *) method parse context start_symbol = let parse_ignored_section yy_current yy_get_next = (* A special parser which should be used after is found. *) while yy_current() = Ignore do ignore(yy_get_next()); done; ( match yy_current() with Conditional_body _ -> () | _ -> raise Parsing.Parse_error; ); let en = context.manager # current_entity in let llev = ref 1 in while !llev >= 1 do let igntok = en # next_ignored_token in (* next_ignored_token: uses a special lexer that only * recognizes Conditional_begin and Conditional_end; * other character combinations are ignored. *) (* NOTE: next_ignored_token works much like yy_get_next, * but it does not set the current token! 
*) match igntok with Conditional_begin _ -> llev := !llev + 1 | Conditional_end _ -> llev := !llev - 1; (* Because the loop may be exited now: *) context.current_token <- igntok; | (End_entity | Eof) -> raise Parsing.Parse_error | _ -> () done; in let check_and_parse_xmldecl xmldecl = if process_xmldecl then begin let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in check_version_num v; doc # init_xml_version v; let v = match s with None -> false | Some "yes" -> true | Some "no" -> false | _ -> raise (WF_error("Illegal 'standalone' declaration")) in if config.recognize_standalone_declaration then dtd # set_standalone_declaration v end in let recode_utf8 s = (* Recode 's' to UTF-8 *) if config.encoding = `Enc_utf8 then s (* No recoding necessary *) else Netconversion.recode_string ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s in %% /* The following grammar looks similar to ocamlyacc grammars, but * ocamlyacc is actually not used to transform the grammar into a parser. * Instead, the parser generator m2parsergen is applied. * * The format of the grammar is different (see m2parsergen/README), * but I hope that you can understand most features immediately. * * The type of the parser is different: m2parsergen creates a top-down * parser while ocamlyacc generates a LALR-1 parser. * * The way the generated code is called is different: ocamlyacc produces * lots of top-level definitions whereas m2parsergen generates only * a local let-in-phrase. This is explained in the already mentioned * README file. 
*/ /* See Pxp_types.ml for comments to the various tokens */ %token Begin_entity %token End_entity %token Comment_begin %token Comment_end %token Ignore %token Eq %token Rangle %token Rangle_empty %token <> Conditional_begin %token <> Conditional_body %token <> Conditional_end %token Percent %token Plus %token Star %token Bar %token Comma %token Qmark %token Pcdata %token Required %token Implied %token Fixed %token Eof %token <> Comment_material %token <> Doctype %token <> Doctype_rangle %token <> Dtd_begin %token <> Dtd_end %token <> Decl_element %token <> Decl_attlist %token <> Decl_entity %token <> Decl_notation %token <> Decl_rangle %token <> Lparen %token <> Rparen %token <> RparenPlus %token <> RparenStar %token <> RparenQmark %token <> Tag_beg %token <> Tag_end %token <> PI %token <> PI_xml %token <> Cdata %token <> CRef %token <> ERef %token <> PERef %token <> CharData %token <> LineEnd %token <> Name %token <> Nametoken %token <> Attval %token <> Attval_nl_normalized %token <> Unparsed_string /* START SYMBOLS: * * "ext_document": parses a complete XML document (i.e. containing a * and an element) * "ext_declarations": parses an "external DTD subset", i.e. a sequence * of declarations * "ext_element": parses a single element (no allowed); * the element needs not to be the root element of the * DTD * * The functions corresponding to these symbols return always () because * they only have side-effects. */ /* SOME GENERAL COMMENTS: * * The parser does not get its tokens from the lexers directly. Instead of * this, there is an entity object between the parser and the lexers. This * object already handles: * * - References to general and parameter entities. The token stream is * modified such that tokens automatically come from the referenced entities. * External parameter entities and all general entities are embraced by * the two special tokens Begin_entity and End_entity. The parser must * check that these braces are correctly nested. 
*/ %% ext_document(): Begin_entity doc_xmldecl_then_misc_then_prolog_then_rest() End_entity {{ if n_tags_open <> 0 then raise(WF_error("Missing end tag")) }} /* In the following rule, we must find out whether there is an XML declaration * or not, and directly after that either "process_xmldecl" or * "process_missing_xmldecl" of the current entity must be called. * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out * in the "$" clause immediately following the first token. * * TODO: This is not enough. The first token may be a tag, and the tag * may already contain non-ASCII characters. (But in this case, the resolvers * assume UTF8, and they are right...) */ doc_xmldecl_then_misc_then_prolog_then_rest(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; check_and_parse_xmldecl pl; }} misc()* doc_prolog_then_rest() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} misc() misc()* doc_prolog_then_rest() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} doctypedecl() misc()* contents_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} contents_start() {{ () }} doc_prolog_then_rest(): doctypedecl() misc()* contents_start() {{ () }} | contents_start() {{ () }} ext_element(): Begin_entity el_xmldecl_then_misc_then_rest() End_entity {{ if n_tags_open <> 0 then raise(WF_error("Missing end tag")) }} /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ el_xmldecl_then_misc_then_rest(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} misc()* contents_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} misc() misc()* contents_start() {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} contents_start() {{ () }} ext_declarations(): /* Parses a sequence of declarations given by an entity. 
As side-effect, * the parsed declarations are put into the dtd object. */ Begin_entity decl_xmldecl_then_rest() {{ () }} | Eof {{ () }} decl_xmldecl_then_rest(): /* Note: This rule is also called from declaration()! */ pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} declaration()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} declaration() declaration()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} End_entity {{ () }} misc(): pi() {{ () }} | data: CharData /* In this context, the lexers sometimes do not recognize white space; * instead CharData tokens containing white space are delivered. */ {{ self # only_whitespace data }} | Ignore {{ () }} | comment() {{ () }} /********************* DOCUMENT TYPE DECLARATION *************************/ doctypedecl(): /* parses from . As side-effect, first the declarations of * the internal DTD (if any) are put into !!on_dtd, then the declarations * of the external DTD (if any) are put into this DTD object. */ doctype_entid: Doctype ws: Ignore Ignore* doctypedecl_material (doctype_entid) {{ () }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'")) | _ -> raise(WF_error("Bad DOCTYPE declaration")) }} /* TRICK: * ws: Ignore? Ignore* * is meant seriously. The effect is that ws becomes a boolean variable * which is true if there is an Ignore token and false otherwise. * This construct is faster than just * ws: Ignore* * in which case ws becomes an integer variable containing the number of * Ignore tokens. Counting the number of tokens is slower than only checking * the existence. * * We need the information whether there is an Ignore token (representing * white space), because white space is only obligatory if also an identifier * for the external subset is parsed; this conditional syntax constraint is * simply programmed in the body of the grammar rule. 
*/ doctypedecl_material(doctype_entid): root_name: Name ws: Ignore? Ignore* external_subset: external_id()? Ignore* internal_subset: internal_dtd()? Ignore* doctype_rangle_entid: Doctype_rangle {{ if doctype_entid != doctype_rangle_entid then raise (Validation_error("Entities not properly nested with DOCTYPE declaration")); dtd # set_root root_name; begin match external_subset, internal_subset with None, None -> () (* no DTD means no ID *) | None, Some _ -> dtd # set_id Internal | Some id, None -> dtd # set_id (External id) | Some id, Some _ -> dtd # set_id (Derived id) end; (* Get now the external doctype declaration. Note that the internal * subset has precedence and must be read first. *) begin match external_subset with None -> () | Some id -> if not ws then raise(WF_error("Whitespace is missing after `DOCTYPE " ^ root_name ^ "'")); let r' = resolver # clone in let pobj = new parser_object (new document config.warner) dtd extend_dtd config r' spec process_xmldecl (fun x -> x) None in let en = new external_entity r' dtd "[dtd]" config.warner id false config.errors_with_line_numbers config.encoding in en # set_debugging_mode (config.debugging_mode); let mgr = new entity_manager en in en # open_entity true Declaration; try let context = make_context mgr in pobj # parse context Ext_declarations; ignore(en # close_entity); with error -> ignore(en # close_entity); r' # close_all; let pos = mgr # position_string in raise (At(pos, error)) end; dtd # validate }} ? {{ match !yy_position with "doctype_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad DOCTYPE declaration")) }} /* Note that there are no keywords for SYSTEM or PUBLIC, as these would * be difficult to recognize in the lexical contexts. Because of this, * SYSTEM/PUBLIC is parsed as name, and the rule for everything after * SYSTEM/PUBLIC is computed dynamically. 
*/ external_id(): tok:Name $ {{ let followup = match tok with "SYSTEM" -> parse_system_id (* Apply the rule system_id (below) to parse the * rest of the ID *) | "PUBLIC" -> parse_public_id (* Apply the rule public_id (below) to parse the * rest of the ID *) | _ -> raise(WF_error("SYSTEM or PUBLIC expected")) in }} ws:Ignore Ignore* r:[followup]() {{ r }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after " ^ tok)) | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier")) }} system_id(): str:Unparsed_string {{ System (recode_utf8 str) }} public_id(): str1: Unparsed_string ws: Ignore Ignore* str2: Unparsed_string {{ check_public_id str1; Public(recode_utf8 str1, recode_utf8 str2) }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier")) | _ -> raise(WF_error("Bad PUBLIC identifier")) }} /* The internal subset: "[" declaration* "]". While parsing the declarations * the object variable p_internal_subset must be true; however, if there * are entity references, this variable must be reset to false during * the entity. (See the rule for "declaration" below.) */ internal_dtd(): dtd_begin_entid: internal_dtd_begin() declaration()* dtd_end_entid: internal_dtd_end() {{ if dtd_begin_entid != dtd_end_entid then raise(Validation_error("Entities not properly nested with internal DTD subset")) }} ? {{ match !yy_position with "dtd_end_entid" -> raise(WF_error("`]' expected")) | _ -> raise(WF_error("Bad internal DTD subset")) }} internal_dtd_begin(): Dtd_begin {{ assert (not p_internal_subset); p_internal_subset <- true }} internal_dtd_end(): Dtd_end {{ assert p_internal_subset; p_internal_subset <- false }} declaration(): /* Parses a single declaration (or processing instruction). As side-effect * the parsed declaration is stored into the dtd object. */ elementdecl() {{ () }} | attlistdecl() {{ () }} | entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid) {{ () }} ? 
{{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after ENTITY")) | "e" -> raise(WF_error("Name or `%' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} | notationdecl() {{ () }} | pi: PI {{ let target, value = pi in let pi = new proc_instruction target value config.encoding in dtd # add_pinstr pi }} | Ignore {{ () }} | Comment_begin Comment_material* ce:Comment_end {{ () }} ? {{ match !yy_position with "ce" -> raise(WF_error("`-->' expected")) | _ -> raise(WF_error("Bad comment")) }} | Begin_entity $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity' * rule is parsed. This allows unrestricted usage of parameter entities * within declarations of internal entities. *) let old_p_internal_subset = p_internal_subset in p_internal_subset <- false; }} decl_xmldecl_then_rest() {{ (* Restore the old value of 'p_internal_subset'. *) p_internal_subset <- old_p_internal_subset; () }} | begin_entid:Conditional_begin $ {{ (* Check whether conditional sections are allowed at this position. *) if p_internal_subset then raise(WF_error("Restriction of the internal subset: Conditional sections not allowed")); }} Ignore* cond:conditional_section() end_entid:Conditional_end {{ (* Check whether Conditional_begin and Conditional_end are in the same * entity. (This restriction is explained in the file SPECS.) *) if begin_entid != end_entid then raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)")); }} ? {{ match !yy_position with "end_entid" -> raise(WF_error("`>]>' expected")) | "cond" -> raise(WF_error("INCLUDE or IGNORE expected")) | _ -> raise(WF_error("Bad conditional section")) }} /* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the * right parsing rule is dynamic. * Note that parse_ignored_section is not defined by a grammar rule but * by a conventional let-binding above. 
*/ conditional_section(): include_or_ignore:Name $ {{ let parsing_function = match include_or_ignore with "INCLUDE" -> parse_included_section (* invoke rule "included_section" below *) | "IGNORE" -> parse_ignored_section (* invoke function "parse_ignored_section" *) | _ -> raise(WF_error("INCLUDE or IGNORE expected")) in }} [ parsing_function ] () {{ () }} ? {{ raise(WF_error("Bad conditional section")) }} included_section(): Conditional_body declaration()* {{ () }} | Ignore Ignore* Conditional_body declaration()* {{ () }} /*************************** ELEMENT DECLARATIONS ********************/ elementdecl(): /* parses . Puts the parsed element type as side-effect into * dtd. */ decl_element_entid: Decl_element $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* content_model: contentspec() Ignore* decl_rangle_entid: Decl_rangle {{ if decl_element_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ELEMENT declaration"); if extend_dtd then begin let el = new dtd_element dtd name in (* It is allowed that an precedes the corresponding * . Because of this it is possible that there is already * an element called 'name' in the DTD, and we only must set the content * model of this element. *) try dtd # add_element el; el # set_cm_and_extdecl content_model extdecl; with Not_found -> (* means: there is already an element 'name' *) let el' = dtd # element name in el' # set_cm_and_extdecl content_model extdecl; (* raises Validation_error if el' already has a content model *) end }} ? 
{{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "name" -> raise(WF_error("The name of the element is expected here")) | "content_model" -> raise(WF_error("Content model expression expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad element type declaration")) }} contentspec(): /* parses a content model and returns it (type content_model_type) */ name: Name /* EMPTY or ANY */ {{ match name with "EMPTY" -> Empty | "ANY" -> Any | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected")) }} | entid:Lparen Ignore* term:mixed_or_regexp(entid) {{ term }} ? {{ raise(WF_error("Bad content model expression")) }} /* Many of the following rules have an lparen_entid argument. This is the * internal ID of the entity containing the corresponding left parenthesis; * by comparing it with the ID of the entity of the right parenthesis the * contraint is implemented that both parentheses must be in the same entity. */ mixed_or_regexp(lparen_entid): re: choice_or_seq(lparen_entid) {{ Regexp re }} | m: mixed(lparen_entid) {{ m }} multiplier(): /* returns one of the multiplier symbols (?,*,+) */ Plus {{ Plus }} | Star {{ Star }} | Qmark {{ Qmark }} mixed (lparen_entid) : Pcdata Ignore* material: mixed_alternatives_top() {{ let rest, rparen_entid = material in if lparen_entid != rparen_entid then raise (Validation_error "Entities not properly nested with parentheses"); Mixed (MPCDATA :: rest) }} ? {{ raise(WF_error("Bad content model expression")) }} mixed_alternatives_top(): entid: Rparen {{ [], entid }} | entid: RparenStar {{ [], entid }} | Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar {{ (MChild name :: names), entid }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | "entid" -> raise(WF_error("`)*' expected")) | _ -> raise(WF_error("Bad content model expression")) }} mixed_alternative() : Bar Ignore* name:Name Ignore* {{ MChild name }} ? 
{{ match !yy_position with "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad content model expression")) }} choice_or_seq (lparen_entid): /* parses either a regular expression, or a mixed expression. Returns * Mixed spec or Regexp spec (content_model_type). * Which kind of expression (regexp or mixed) is being read is recognized * after the first subexpression has been parsed; the other subexpressions * must be of the same kind. */ re: cp() Ignore* factor: choice_or_seq_factor() {{ let (finalmark,subexpr), rparen_entid = factor in if lparen_entid != rparen_entid then raise (Validation_error "Entities not properly nested with parentheses"); (* Check that the other subexpressions are "regexp", too, and * merge them with the first. *) let re' = match subexpr with Alt [] -> re | Alt alt -> Alt (re :: alt) | Seq seq -> Seq (re :: seq) | _ -> assert false in (* Interpret the finalmark. *) match finalmark with Ignore -> re' | Plus -> Repeated1 re' | Star -> Repeated re' | Qmark -> Optional re' | _ -> assert false }} ? {{ raise(WF_error("Bad content model expression")) }} choice_or_seq_factor(): /* Parses "||...)" or ",,...)", both forms optionally * followed by ?, *, or +. 
* Returns ((finalmark, expr), rparen_entid), where * - finalmark is the character after the right parenthesis or Ignore * - expr is either * Alt [] meaning that only ")" has been found * Alt non_empty_list meaning that the subexpressions are separated by '|' * Seq non_empty_list meaning that the subexpressions are separated by ',' */ entid:Rparen {{ (Ignore, Alt []), entid }} | entid:RparenPlus {{ (Plus, Alt []), entid }} | entid:RparenStar {{ (Star, Alt []), entid }} | entid:RparenQmark {{ (Qmark, Alt []), entid }} | Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor() {{ let (finalmark, subexpr), rparen_entid = factor in begin match subexpr with Alt [] -> (finalmark, (Alt [re])), rparen_entid | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) end }} ? {{ raise(WF_error("Bad content model expression")) }} | Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor() {{ let (finalmark, subexpr), rparen_entid = factor in begin match subexpr with Alt [] -> (finalmark, (Seq [re])), rparen_entid | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) end }} ? {{ raise(WF_error("Bad content model expression")) }} cp(): /* parse either a name, or a parenthesized subexpression "(...)" */ name:Name m:multiplier()? {{ match m with None -> Child name | Some Plus -> Repeated1 (Child name) | Some Star -> Repeated (Child name) | Some Qmark -> Optional (Child name) | _ -> assert false }} ? {{ raise(WF_error("Bad content model expression")) }} | entid:Lparen Ignore* m:choice_or_seq(entid) {{ m }} ? {{ raise(WF_error("Bad content model expression")) }} /********************* ATTRIBUTE LIST DECLARATION ***********************/ attlistdecl(): /* parses . Enters the attribute list in dtd as side- * effect. 
*/ decl_attlist_entid: Decl_attlist $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* el_name: Name ws: Ignore? Ignore* factor: attdef_factor() {{ let at_list, decl_rangle_entid = factor in if decl_attlist_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ATTLIST declaration"); if not ws && at_list <> [] then begin match at_list with (name,_,_) :: _ -> (* This is normally impossible, because the lexer demands * some other token between two names. *) raise(WF_error("Whitespace is missing before `" ^ name ^ "'")); | _ -> assert false end; if extend_dtd then begin let new_el = new dtd_element dtd el_name in (* Note that it is allowed that precedes the corresponding * declaration. In this case we add the element declaration * already to the DTD but leave the content model unspecified. *) let el = try dtd # add_element new_el; new_el with Not_found -> (* already added *) let old_el = dtd # element el_name in if old_el # attribute_names <> [] then config.warner # warn ("More than one ATTLIST declaration for element type `" ^ el_name ^ "'"); old_el in List.iter (fun (a_name, a_type, a_default) -> el # add_attribute a_name a_type a_default extdecl) at_list end }} ? {{ match !yy_position with "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST")) | "el_name" -> raise(WF_error("The name of the element is expected here")) | "factor" -> raise(WF_error("Another attribute name or `>' expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} attdef_factor(): /* parses a list of triples and returns the * list as (string * att_type * att_default) list. */ attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor() {{ let attdef_rest, decl_rangle_entid = factor in if not ws && attdef_rest <> [] then begin match attdef_rest with (name,_,_) :: _ -> raise(WF_error("Missing whitespace before `" ^ name ^ "'")); | _ -> assert false end; (attdef :: attdef_rest), decl_rangle_entid }} ? 
{{ match !yy_position with | "factor" -> raise(WF_error("Another attribute name or `>' expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} | entid:Decl_rangle {{ [], entid }} attdef(): /* Parses a single triple */ name: Name ws1: Ignore Ignore* tp: atttype() ws2: Ignore Ignore* default: defaultdecl() {{ (name,tp,default) }} ? {{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "tp" -> raise(WF_error("Type of attribute or `(' expected")) | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected")) | _ -> raise(WF_error("Bad attribute declaration")) }} atttype(): /* Parses an attribute type and returns it as att_type. */ name: Name $ {{ let followup = if name = "NOTATION" then parse_notation else parse_never in }} nota: [followup]()? {{ match name with "CDATA" -> A_cdata | "ID" -> A_id | "IDREF" -> A_idref | "IDREFS" -> A_idrefs | "ENTITY" -> A_entity | "ENTITIES" -> A_entities | "NMTOKEN" -> A_nmtoken | "NMTOKENS" -> A_nmtokens | "NOTATION" -> (match nota with None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)")) | Some n -> n ) | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected")) }} ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }} | Lparen Ignore* name: name_or_nametoken() Ignore* names: nmtoken_factor()* rp: Rparen /* Enumeration */ {{ A_enum(name :: names) }} ? 
{{ match !yy_position with "name" -> raise(WF_error("Name expected")) | "names" -> raise(WF_error("`|' and more names expected, or `)'")) | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) | _ -> raise(WF_error("Bad enumeration type")) }} never(): /* The always failing rule */ $ {{ raise Not_found; }} Doctype /* questionable */ {{ A_cdata (* Does not matter *) }} notation(): Ignore Ignore* lp: Lparen Ignore* name: Name Ignore* names: notation_factor()* rp: Rparen {{ A_notation(name :: names) }} ? {{ match !yy_position with "lp" -> raise(WF_error("`(' expected")) | "name" -> raise(WF_error("Name expected")) | "names" -> raise(WF_error("`|' and more names expected, or `)'")) | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) | _ -> raise(WF_error("Bad NOTATION type")) }} notation_factor(): /* Parse "|" and return the name */ Bar Ignore* name:Name Ignore* {{ name }} ? {{ match !yy_position with "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad NOTATION type")) }} nmtoken_factor(): /* Parse "|" and return the nmtoken */ Bar Ignore* n:name_or_nametoken() Ignore* {{ n }} ? {{ match !yy_position with "n" -> raise(WF_error("Nametoken expected")) | _ -> raise(WF_error("Bad enumeration type")) }} name_or_nametoken(): n:Name {{ n }} | n:Nametoken {{ n }} /* The default values must be expanded and normalized. This has been implemented * by the function expand_attvalue. */ defaultdecl(): /* Parse the default value for an attribute and return it as att_default */ Required {{ D_required }} | Implied {{ D_implied }} | Fixed ws:Ignore Ignore* str:Unparsed_string {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }} ? 
{{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after #FIXED")) | "str" -> raise(WF_error("String literal expected")) | _ -> raise(WF_error("Bad #FIXED default value")) }} | str:Unparsed_string {{ D_default (expand_attvalue lexerset dtd str config.warner false) }} /**************************** ENTITY DECLARATION ***********************/ entitydecl(decl_entity_entid): /* parses everything _after_ . The parsed * entity declaration is entered into the dtd object as side-effect. */ name: Name $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws: Ignore Ignore* material: entitydef() Ignore* decl_rangle_entid: Decl_rangle /* A general entity */ {{ if decl_entity_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ENTITY declaration"); let en = (* Distinguish between * - internal entities * - external entities * - NDATA (unparsed) entities *) match material with (Some s, None, None) -> new internal_entity dtd name config.warner s p_internal_subset config.errors_with_line_numbers false config.encoding | (None, Some xid, None) -> new external_entity (resolver # clone) dtd name config.warner xid false config.errors_with_line_numbers config.encoding | (None, Some xid, Some n) -> (new ndata_entity name xid n config.encoding :> entity) | _ -> assert false in dtd # add_gen_entity en extdecl }} ? 
{{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing")) | "material" -> raise(WF_error("String literal or identifier expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} | Percent $ {{ let extdecl = context.manager # current_entity_counts_as_external in }} ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* material: pedef() Ignore* decl_rangle_entid: Decl_rangle /* A parameter entity */ {{ if decl_entity_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with ENTITY declaration"); let en = (* Distinguish between internal and external entities *) match material with (Some s, None) -> new internal_entity dtd name config.warner s p_internal_subset config.errors_with_line_numbers true config.encoding | (None, Some xid) -> new external_entity (resolver # clone) dtd name config.warner xid true config.errors_with_line_numbers config.encoding | _ -> assert false in (* The following two lines force that even internal entities count * as external (for the standalone check) if the declaration of * the internal entity occurs in an external entity. *) if extdecl then en # set_counts_as_external; dtd # add_par_entity en; }} ? {{ match !yy_position with ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) | "material" -> raise(WF_error("String literal or identifier expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad entity declaration")) }} entitydef(): /* parses the definition value of a general entity. 
Returns either: * - (Some s, None, None) meaning the definition of an internal entity * with (literal) value s has been found * - (None, Some x, None) meaning that an external parsed entity with * external ID x has been found * - (None, Some x, Some n) meaning that an unparsed entity with * external ID x and notations n has been found */ str:Unparsed_string {{ Some str, None, None }} | id:external_id() ws:Ignore? Ignore* decl:ndatadecl()? {{ if not ws && decl <> None then raise(WF_error("Whitespace missing before `NDATA'")); None, Some id, decl }} pedef(): /* parses the definition value of a parameter entity. Returns either: * - (Some s, None) meaning that the definition of an internal entity * with (literal) value s has been found * - (None, Some x) meaning that an external ID x has been found */ str:Unparsed_string {{ Some str, None }} | id:external_id() {{ None, Some id }} ndatadecl(): /* Parses either NDATA "string" or the empty string; returns Some "string" * in the former, None in the latter case. */ ndata:Name ws:Ignore Ignore* name:Name {{ if ndata = "NDATA" then name else raise(WF_error("NDATA expected")) }} ? {{ match !yy_position with "ws" -> raise(WF_error("Whitespace is missing after NDATA")) | "name" -> raise(WF_error("Name expected")) | _ -> raise(WF_error("Bad NDATA declaration")) }} /**************************** NOTATION DECLARATION *******************/ notationdecl(): /* parses and enters the notation declaration into the * dtd object as side-effect */ decl_notation_entid: Decl_notation ws1: Ignore Ignore* name: Name ws2: Ignore Ignore* sys_or_public: Name /* SYSTEM or PUBLIC */ ws3: Ignore Ignore* str1: Unparsed_string ws: Ignore? Ignore* str2: Unparsed_string? 
Ignore* decl_rangle_entid: Decl_rangle {{ if decl_notation_entid != decl_rangle_entid then raise (Validation_error "Entities not properly nested with NOTATION declaration"); let xid = (* Note that it is allowed that PUBLIC is only followed by one * string literal *) match sys_or_public with "SYSTEM" -> if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument")); System (recode_utf8 str1) | "PUBLIC" -> begin match str2 with None -> check_public_id str1; Public(recode_utf8 str1,"") | Some p -> if not ws then raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id")); check_public_id str1; Public(recode_utf8 str1, recode_utf8 p) end | _ -> raise(WF_error("PUBLIC or SYSTEM expected")) in if extend_dtd then begin let no = new dtd_notation name xid config.encoding in dtd # add_notation no end }} ? {{ match !yy_position with ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing")) | "name" -> raise(WF_error("Name expected")) | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected")) | ("str1"|"str2") -> raise(WF_error("String literal expected")) | "decl_rangle_entid" -> raise(WF_error("`>' expected")) | _ -> raise(WF_error("Bad NOTATION declaration")) }} /****************************** ELEMENTS **************************/ /* In the following rules, the number of error rules is reduced to * improve the performance of the parser. */ contents_start(): /* parses ... misc*, i.e. exactly one element followed * optionally by white space or processing instructions. * The element is entered into the global variables as follows: * - If elstack is non-empty, the parsed element is added as new child to * the top element of the stack. * - If elstack is empty, the root_examplar object is modified rather than * that a new element is created. If additionally the variable root is * None, it is assigned Some root_examplar. * Note that the modification of the root_exemplar is done by the method * internal_init. 
* The reason why the root element is modified rather than newly created * is a typing requirement. It must be possible that the class of the root * is derived from the original class element_impl, i.e. the user must be * able to add additional methods. If we created a new root object, we * would have to denote to which class the new object belongs; the root * would always be an 'element_impl' object (and not a derived object). * If we instead cloned an exemplar object and modified it by the * "create" method, the root object would belong to the same class as the * exemplar (good), but the type of the parsing function would always * state that an 'element_impl' was created (because we can pass the new * object only back via a global variable). The only solution is to * modify the object that has been passed to the parsing function directly. */ $ {{ dtd <- transform_dtd dtd; }} start_tag() content()* {{ () }} content(): /* parses: start tags, end tags, content, or processing * instructions. That the tags are properly nested is dynamically checked. * As result, recognized elements are added to their parent elements, * content is added to the element containing it, and processing instructions * are entered into the element embracing them. (All as side-effects.) */ start_tag() {{ () }} | end_tag() {{ () }} | char_data() {{ () }} | cref() {{ () }} | pi() {{ () }} | entity_ref() {{ () }} | comment() {{ () }} entity_ref(): Begin_entity eref_xmldecl_then_rest() {{ if n_tags_open = 0 then raise(WF_error("Entity reference not allowed here")) }} /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ eref_xmldecl_then_rest(): pl:PI_xml $ {{ context.manager # current_entity # process_xmldecl pl; }} content()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} content() content()* End_entity {{ () }} | $ {{ context.manager # current_entity # process_missing_xmldecl; }} End_entity {{ () }} start_tag(): /* parses or . 
* * EFFECT: If elstack is non-empty, the element is added to the * top element of the stack as new child, and the element * is pushed on the stack. If elstack is empty, the root_exemplar is * modified and gets the parsed name and attribute list. The root_exemplar * is pushed on the stack. If additionally the variable root is empty, too, * this variable is initialized. * If the form has been parsed, no element is pushed * on the stack. */ tag: Tag_beg $ {{ let position = if config.store_element_positions then Some(context.manager # position) else None in }} ws: Ignore? Ignore* attlist: attribute()* emptiness: start_tag_rangle() /* Note: it is guaranteed that there is whitespace between Tag_beg and * the name of the first attribute, because there must be some separator. * So we need not to check ws! */ {{ let rec check_attlist al = match al with (nv1, num1) :: al' -> if not num1 && al' <> [] then begin match al with ((n1,_),_) :: ((n2,_),_) :: _ -> raise(WF_error("Whitespace is missing between attributes `" ^ n1 ^ "' and `" ^ n2 ^ "'")) | _ -> assert false end; check_attlist al' | [] -> () in check_attlist attlist; let name, tag_beg_entid = tag in let attlist' = List.map (fun (nv,_) -> nv) attlist in let d = create_element_node ?position:position spec dtd name attlist' in begin match id_index with None -> () | Some idx -> (* Put the ID attribute into the index, if present *) begin try let v = d # id_attribute_value in (* may raise Not_found *) idx # add v d (* may raise ID_not_unique *) with Not_found -> (* No ID attribute *) () | ID_not_unique -> (* There is already an ID with the same value *) raise(Validation_error("ID not unique")) end end; if n_tags_open = 0 then begin if root = None then begin (* We have found the begin tag of the root element. *) if config.enable_super_root_node then begin (* The user wants the super root instead of the real root. * The real root element becomes the child of the VR. 
*) (* Assertion: self # current is the super root *) assert (self # current # node_type = T_super_root); root <- Some (self # current); self # current # add_node d; doc # init_root (self # current); end else begin (* Normal behaviour: The user wants to get the real root. *) root <- Some d; doc # init_root d; end; end else (* We have found a second topmost element. This is illegal. *) raise(WF_error("Document must consist of only one toplevel element")) end else begin (* We have found some inner begin tag. *) self # save_data; (* Save outstanding data material first *) self # current # add_node d end; if emptiness then (* An empty tag like . *) d # local_validate ~use_dfa:config.validate_by_dfa () else begin (* A non-empty tag. *) Stack.push (d, tag_beg_entid) elstack; n_tags_open <- n_tags_open + 1; end; }} ? {{ match !yy_position with "attlist" -> raise(WF_error("Bad attribute list")) | "emptiness" -> raise(WF_error("`>' or `/>' expected")) | _ -> raise(WF_error("Bad start tag")) }} attribute(): /* Parses name="value" */ n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore* {{ (n,v), ws }} attval(): v:Attval {{ expand_attvalue lexerset dtd v config.warner true }} | v:Attval_nl_normalized {{ expand_attvalue lexerset dtd v config.warner false }} start_tag_rangle(): Rangle {{ false }} | Rangle_empty {{ true }} end_tag(): /* parses . * Pops the top element from the elstack and checks if it is the same * element. 
*/ tag:Tag_end Ignore* Rangle {{ (* Pop the matching start tag and check proper nesting. *) let name, tag_end_entid = tag in if n_tags_open = 0 then raise(WF_error("End-tag without start-tag")); self # save_data; (* Save outstanding data material first *) let x, tag_beg_entid = Stack.pop elstack in let x_name = match x # node_type with | T_element n -> n | _ -> assert false in if name <> x_name then raise(WF_error("End-tag does not match start-tag")); (* Note: '!=' is physical inequality here; entity IDs are compared by identity. *) if tag_beg_entid != tag_end_entid then raise(WF_error("End-tag not in the same entity as the start-tag")); x # local_validate ~use_dfa:config.validate_by_dfa (); n_tags_open <- n_tags_open - 1; assert (n_tags_open >= 0); }}
char_data(): /* Parses any literal characters not otherwise matching, and adds the * characters to the top element of elstack. * If elstack is empty, it is assumed that there is no surrounding * element, and any non-white space character is forbidden. */ data:CharData {{ if n_tags_open = 0 then (* only white space is allowed *) self # only_whitespace data else self # collect_data data (* We collect the chardata material until the next end tag is * reached. Then the collected material will be concatenated and * stored as a single T_data node (see end_tag rule above) * using save_data. *) }} | data:Cdata {{ if n_tags_open = 0 then raise (WF_error("CDATA section not allowed here")); self # collect_data data (* Also collect CDATA material *) }}
cref(): /* Parses &#...; and adds the character to the top element of elstack. */ code:CRef {{ if n_tags_open = 0 then (* No surrounding element: character references are not allowed *) raise(WF_error("Character reference not allowed here")); self # collect_data (character config.encoding config.warner code) (* Also collect character references *) }}
pi(): /* Parses (but not ). * If there is a top element in elstack, the processing instruction is added * to this element.
*/ pi: PI
    {{ (* Handle a processing instruction found in the document instance. *)
       let position =
         if config.store_element_positions then
           Some(context.manager # position)
         else
           None
       in
       let target,value = pi in
       (* Fix: use '&&' instead of the long-deprecated strict-AND alias '&'
        * (identical semantics for this expression). *)
       if n_tags_open = 0 && not config.enable_super_root_node then
         (* PI outside the root element (and no super root node requested):
          * attach it to the document object itself. *)
         doc # add_pinstr (new proc_instruction target value config.encoding)
       else begin
         (* Special case: if processing instructions are processed inline,
          * they are wrapped into T_pinstr nodes. *)
         if config.enable_pinstr_nodes then begin
           self # save_data;            (* Save outstanding data material first *)
           let pinstr = new proc_instruction target value config.encoding in
           let wrapper = create_pinstr_node ?position:position spec dtd pinstr in
           wrapper # local_validate();  (* succeeds always *)
           self # current # add_node wrapper;
         end
         else
           (* Normal behaviour: Add the PI to the parent element. *)
           self # current # add_pinstr
             (new proc_instruction target value config.encoding)
       end
    }}

comment(): /* Parses */
  Comment_begin
  $ {{ (* Record the position only if comment nodes will actually be built. *)
       let position =
         if config.enable_comment_nodes && config.store_element_positions then
           Some(context.manager # position)
         else
           None
       in
    }}
  mat: Comment_material*
  ce: Comment_end
    {{ (* Comment text is only materialized as a node when
        * enable_comment_nodes is set; otherwise it is dropped. *)
       if config.enable_comment_nodes then begin
         self # save_data;            (* Save outstanding data material first *)
         let comment_text = String.concat "" mat in
         let wrapper = create_comment_node ?position:position spec dtd comment_text in
         wrapper # local_validate();  (* succeeds always *)
         self # current # add_node wrapper;
       end
    }}
  ? {{ match !yy_position with
         | "ce" -> raise(WF_error("`-->' expected"))
         | _ -> raise(WF_error("Bad comment"))
    }}

%%

(* The method "parse" continues here...
*) try match start_symbol with Ext_document -> parse_ext_document context.current context.get_next | Ext_declarations -> parse_ext_declarations context.current context.get_next | Ext_element -> parse_ext_element context.current context.get_next with Not_found -> raise Parsing.Parse_error (*********** The method "parse" ends here *************) (**********************************************************************) (* Here ends the class definition: *) end ;;
(**********************************************************************)
open Pxp_reader;;
(* A minimal extension implementation: it only remembers the node it is
 * attached to.  [clone] returns a functional copy ({< >}); [node] fails
 * by assertion if [set_node] has not been called before. *)
class default_ext = object(self) val mutable node = (None : ('a extension node as 'a) option) method clone = {< >} method node = match node with None -> assert false | Some n -> n method set_node n = node <- Some n end ;;
let default_extension = new default_ext;;
(* Default specification: every exemplar (super root, comment, pinstr,
 * data, default element) shares the single [default_extension] object,
 * and the element mapping starts out empty. *)
let default_spec = make_spec_from_mapping ~super_root_exemplar: (new element_impl default_extension) ~comment_exemplar: (new element_impl default_extension) ~default_pinstr_exemplar: (new element_impl default_extension) ~data_exemplar: (new data_impl default_extension) ~default_element_exemplar: (new element_impl default_extension) ~element_mapping: (Hashtbl.create 1) () ;;
(* [idref_pass id_index root] walks the tree below [root] and checks that
 * every IDREF/IDREFS attribute value occurs as a key in [id_index]; a
 * missing ID raises Validation_error, wrapped in [At ...] with the node's
 * entity/line/column when line numbers are available (pos_line <> 0). *)
let idref_pass id_index root = let error t att value = let name = match t # node_type with T_element name -> name | _ -> assert false in let text = "Attribute `" ^ att ^ "' of element `" ^ name ^ "' refers to unknown ID `" ^ value ^ "'" in let pos_ent, pos_line, pos_col = t # position in if pos_line = 0 then raise(Validation_error text) else raise(At("In entity " ^ pos_ent ^ " at line " ^ string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^ ":\n", Validation_error text)) in let rec check_tree t = let idref_atts = t # idref_attribute_names in List.iter (fun att -> match t # attribute att with Value s -> begin try ignore(id_index # find s) with Not_found -> error t att s end | Valuelist l -> List.iter (fun s -> try ignore(id_index # find s) with Not_found -> error t att s ) l |
Implied_value -> () ) idref_atts; List.iter check_tree (t # sub_nodes) in check_tree root ;;
exception Return_DTD of dtd;;
(* Used by extract_dtd_from_document_entity to jump out of the parser *)
(* [call_parser] is the common driver behind all parse_* entry points:
 * it creates the toplevel entity from [~source] (either a ready-made
 * entity or an external ID resolved via the resolver), initializes the
 * resolver, builds the parser object and the entity manager, runs the
 * grammar entry point [~entry], and guarantees that the toplevel entity
 * is closed on every exit path.  Errors are re-raised wrapped in
 * [At(position, ...)]; a [Return_DTD] escape is re-raised unchanged
 * after the cleanup.  If [cfg.idref_pass] is set and both an index and a
 * root exist, the IDREF check runs over the resulting tree.  Returns the
 * parser object. *)
let call_parser ~configuration:cfg ~source:src ~dtd ~extensible_dtd ~document:doc ~specification:spec ~process_xmldecl ~transform_dtd ~(id_index : 'ext #index option) ~use_document_entity ~entry ~init_lexer = let e = cfg.errors_with_line_numbers in let w = cfg.warner in let r, en = match src with Entity(m,r') -> r', m dtd | ExtID(xid,r') -> r', if use_document_entity then new document_entity r' dtd "[toplevel]" w xid e cfg.encoding else new external_entity r' dtd "[toplevel]" w xid false e cfg.encoding in r # init_rep_encoding cfg.encoding; r # init_warner w; en # set_debugging_mode (cfg.debugging_mode); let pobj = new parser_object doc dtd extensible_dtd cfg r spec process_xmldecl transform_dtd (id_index :> 'ext index option) in let mgr = new entity_manager en in en # open_entity true init_lexer; begin try let context = make_context mgr in pobj # parse context entry; ignore(en # close_entity); with Return_DTD d -> ignore(en # close_entity); raise(Return_DTD d) | error -> ignore(en # close_entity); r # close_all; let pos = mgr # position_string in raise (At(pos, error)) end; if cfg.idref_pass then begin match id_index with None -> () | Some idx -> ( match pobj # root with None -> () | Some root -> idref_pass idx root; ) end; pobj
(* [parse_dtd_entity cfg src] parses [src] as a stand-alone DTD (entry
 * point Ext_declarations) and returns the validated dtd object. *)
let parse_dtd_entity cfg src = (* Parse a DTD given as separate entity.
*)
  let dtd = new dtd cfg.warner cfg.encoding in
  let doc = new document cfg.warner in
  (* Fix: the parser object is not needed afterwards — parsing the
   * Ext_declarations entry fills [dtd] by side effect.  Bind it as
   * [_pobj] to make this explicit and silence the unused-variable
   * warning (warning 26). *)
  let _pobj = call_parser
                ~configuration:cfg
                ~source:src
                ~dtd:dtd
                ~extensible_dtd:true      (* Extend the DTD by parsed declarations *)
                ~document:doc
                ~specification:default_spec
                ~process_xmldecl:false    (* The XML declaration is ignored
                                           * (except 'encoding') *)
                ~transform_dtd:(fun x -> x)   (* Do not transform the DTD *)
                ~id_index: None
                ~use_document_entity:false
                ~entry:Ext_declarations   (* Entry point of the grammar *)
                ~init_lexer:Declaration   (* The initially used lexer *)
  in
  dtd # validate;
  if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
  dtd
;;

(* [parse_content_entity ?id_index cfg src dtd spec] parses [src] as an
 * isolated element in validating mode against the (already complete and
 * validated) [dtd] and returns the root node of the parsed element.
 * Raises WF_error if no root element was produced. *)
let parse_content_entity ?id_index cfg src dtd spec =
  (* Parse an element given as separate entity *)
  dtd # validate;          (* ensure that the DTD is valid *)
  if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
  let doc = new document cfg.warner in
  let pobj = call_parser
               ~configuration:cfg
               ~source:src
               ~dtd:dtd
               ~extensible_dtd:true      (* Extend the DTD by parsed declarations *)
               ~document:doc
               ~specification:spec
               ~process_xmldecl:false    (* The XML declaration is ignored
                                          * (except 'encoding') *)
               ~transform_dtd:(fun x -> x)   (* Do not transform the DTD *)
               ~id_index:(id_index :> 'ext index option)
               ~use_document_entity:false
               ~entry:Ext_element        (* Entry point of the grammar *)
               ~init_lexer:Content       (* The initially used lexer *)
  in
  match pobj # root with
      Some r -> r
    | None -> raise(WF_error("No root element"))
;;

(* [parse_wfcontent_entity cfg src spec] parses [src] as an isolated
 * element in well-formedness mode: an "allow arbitrary" DTD is used and
 * parsed declarations do not extend it. *)
let parse_wfcontent_entity cfg src spec =
  let dtd = new dtd cfg.warner cfg.encoding in
  dtd # allow_arbitrary;
  let doc = new document cfg.warner in
  let pobj = call_parser
               ~configuration:cfg
               ~source:src
               ~dtd:dtd
               ~extensible_dtd:false     (* Do not extend the DTD *)
               ~document:doc
               ~specification:spec
               ~process_xmldecl:false    (* The XML declaration is ignored
                                          * (except 'encoding') *)
               ~transform_dtd:(fun x -> x)   (* Do not transform the DTD *)
               ~id_index:None
               ~use_document_entity:false
               ~entry:Ext_element        (* Entry point of
the grammar *) ~init_lexer:Content (* The initially used lexer *) in match pobj # root with Some r -> r | None -> raise(WF_error("No root element")) ;;
(* [iparse_document_entity ?transform_dtd ?id_index cfg0 src spec p_wf]
 * parses a complete document entity and returns the document object.
 * [p_wf] selects well-formedness mode (true) vs. validating mode
 * (false): in wf mode the DTD allows arbitrary content, parsed
 * declarations do not extend it, and the standalone declaration is not
 * recognized (cfg0 is adjusted accordingly). *)
let iparse_document_entity ?(transform_dtd = (fun x -> x)) ?id_index cfg0 src spec p_wf = (* Parse an element given as separate entity *) (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *) let cfg = { cfg0 with recognize_standalone_declaration = cfg0.recognize_standalone_declaration && (not p_wf) } in let dtd = new dtd cfg.warner cfg.encoding in if p_wf then dtd # allow_arbitrary; let doc = new document cfg.warner in let pobj = call_parser ~configuration:cfg ~source:src ~dtd:dtd ~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations * only if in validating mode *) ~document:doc ~specification:spec ~process_xmldecl:true (* The XML declaration is processed *) (* TODO: change to 'not p_wf' ? *) ~transform_dtd:(fun dtd -> let dtd' = transform_dtd dtd in if cfg.accept_only_deterministic_models then dtd' # only_deterministic_models; dtd') ~id_index:(id_index :> 'ext index option) ~use_document_entity:true ~entry:Ext_document (* Entry point of the grammar *) ~init_lexer:Document (* The initially used lexer *) in pobj # doc ;;
(* Validating-mode frontend of [iparse_document_entity]. *)
let parse_document_entity ?(transform_dtd = (fun x -> x)) ?id_index cfg src spec = iparse_document_entity ~transform_dtd:transform_dtd ?id_index:(id_index : 'ext #index option :> 'ext index option) cfg src spec false;;
(* Well-formedness-mode frontend of [iparse_document_entity]. *)
let parse_wfdocument_entity cfg src spec = iparse_document_entity cfg src spec true;;
(* [extract_dtd_from_document_entity cfg src] parses only far enough to
 * obtain the DTD of the document: the [transform_dtd] hook aborts the
 * parse by raising [Return_DTD], which is caught here. *)
let extract_dtd_from_document_entity cfg src = let transform_dtd dtd = raise (Return_DTD dtd) in try let doc = parse_document_entity ~transform_dtd:transform_dtd cfg src default_spec in (* Should not happen: *) doc # dtd with Return_DTD dtd -> (* The normal case: *) dtd ;;
(* The default configuration: warnings are silently dropped, ISO-8859-1
 * internal encoding, validation by DFA, no extra node types.  (The
 * record literal continues on the next physical line.) *)
let default_config = let w = new drop_warnings in { warner = w; errors_with_line_numbers = true; enable_pinstr_nodes = false; enable_super_root_node = false; enable_comment_nodes = false;
encoding = `Enc_iso88591; recognize_standalone_declaration = true; store_element_positions = true; idref_pass = false; validate_by_dfa = true; accept_only_deterministic_models = true; debugging_mode = false; }
(* [hash_index] is the standard implementation of the [index] class
 * type: a hashtable from ID values to nodes.  [add] raises
 * [ID_not_unique] when the ID is already present (the raise happens in
 * the [try] body and is deliberately not caught by the [Not_found]
 * handler); [find] raises [Not_found] for unknown IDs; [index] exposes
 * the underlying table. *)
class [ 'ext ] hash_index = object constraint 'ext = 'ext node #extension val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t) method add s n = try ignore(Hashtbl.find ht s); raise ID_not_unique with Not_found -> Hashtbl.add ht s n method find s = Hashtbl.find ht s method index = ht end
(* ====================================================================== * History: * * $Log$ * Revision 1.1 2000/11/17 09:57:29 lpadovan * Initial revision * * Revision 1.14 2000/08/26 23:23:14 gerd * Bug: from_file must not interpret the file name as URL path. * Bug: When PI and comment nodes are generated, the collected data * material must be saved first. * * Revision 1.13 2000/08/19 21:30:03 gerd * Improved the error messages of the parser * * Revision 1.12 2000/08/18 20:16:25 gerd * Implemented that Super root nodes, pinstr nodes and comment * nodes are included into the document tree. * * Revision 1.11 2000/08/14 22:24:55 gerd * Moved the module Pxp_encoding to the netstring package under * the new name Netconversion. * * Revision 1.10 2000/07/23 02:16:33 gerd * Support for DFAs. * * Revision 1.9 2000/07/14 13:57:29 gerd * Added the id_index feature. * * Revision 1.8 2000/07/09 17:52:45 gerd * New implementation for current_data. * The position of elements is stored on demand. * * Revision 1.7 2000/07/09 01:00:35 gerd * Improvement: It is now guaranteed that only one data node * is added for consecutive character material. * * Revision 1.6 2000/07/08 16:27:29 gerd * Cleaned up the functions calling the parser. * New parser argument: transform_dtd. * Implementations for 'extract_dtd_from_document_entity' and * 'parse_wfcontent_entity'. * * Revision 1.5 2000/07/06 23:05:18 gerd * Initializations of resolvers were missing.
* * Revision 1.4 2000/07/06 22:11:01 gerd * Fix: The creation of the non-virtual root element is protected * in the same way as the virtual root element. * * Revision 1.3 2000/07/04 22:15:18 gerd * Change: Using the new resolver capabilities. * Still incomplete: the new extraction and parsing functions. * * Revision 1.2 2000/06/14 22:19:06 gerd * Added checks such that it is impossible to mix encodings. * * Revision 1.1 2000/05/29 23:48:38 gerd * Changed module names: * Markup_aux into Pxp_aux * Markup_codewriter into Pxp_codewriter * Markup_document into Pxp_document * Markup_dtd into Pxp_dtd * Markup_entity into Pxp_entity * Markup_lexer_types into Pxp_lexer_types * Markup_reader into Pxp_reader * Markup_types into Pxp_types * Markup_yacc into Pxp_yacc * See directory "compatibility" for (almost) compatible wrappers emulating * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. * * ====================================================================== * Old logs from markup_yacc.m2y: * * Revision 1.9 2000/05/29 21:14:57 gerd * Changed the type 'encoding' into a polymorphic variant. * * Revision 1.8 2000/05/27 19:26:19 gerd * Change: The XML declaration is interpreted right after * it has been parsed (no longer after the document): new function * check_and_parse_xmldecl. * When elements, attributes, and entities are declared * it is stored whether the declaration happens in an external * entity (for the standalone check). * The option recognize_standalone_declaration is interpreted. * * Revision 1.7 2000/05/20 20:31:40 gerd * Big change: Added support for various encodings of the * internal representation. * * Revision 1.6 2000/05/14 21:51:24 gerd * Change: Whitespace is handled by the grammar, and no longer * by the entity. * * Revision 1.5 2000/05/14 17:50:54 gerd * Updates because of changes in the token type. * * Revision 1.4 2000/05/11 22:09:17 gerd * Fixed the remaining problems with conditional sections. 
* This seems to be also a weakness of the XML spec! * * Revision 1.3 2000/05/09 00:02:44 gerd * Conditional sections are now recognized by the parser. * There seem some open questions; see the TODO comments! * * Revision 1.2 2000/05/08 22:01:44 gerd * Introduced entity managers (see markup_entity.ml). * The XML declaration is now recognized by the parser. If such * a declaration is found, the method process_xmldecl of the currently * active entity is called. If the first token is not an XML declaration, * the method process_missing_xmldecl is called instead. * Some minor changes. * * Revision 1.1 2000/05/06 23:21:49 gerd * Initial revision. * * * ====================================================================== * * COPIED FROM REVISION 1.19 OF markup_yacc.mly * * Revision 1.19 2000/05/01 15:20:08 gerd * "End tag matches start tag" is checked before "End tag in the * same entity as start tag". * * Revision 1.18 2000/04/30 18:23:08 gerd * Bigger change: Introduced the concept of virtual roots. First, * this reduces the number of checks. Second, it makes it possible to * return the virtual root to the caller instead of the real root (new * config options 'virtual_root' and 'processing_instructions_inline'). * Minor changes because of better CR/CRLF handling. * * Revision 1.17 2000/03/13 23:47:46 gerd * Updated because of interface changes. (See markup_yacc_shadow.mli * rev. 1.8) * * Revision 1.16 2000/01/20 20:54:43 gerd * New config.errors_with_line_numbers. * * Revision 1.15 1999/12/17 22:27:58 gerd * Bugfix: The value of 'p_internal_subset' (an instance * variable of the parser object) is to true when the internal subset * begins, and is set to false when this subset ends. The error was * that references to external entities within this subset did not * set 'p_internal_subset' to false; this is now corrected by introducing * the 'p_internal_subset_stack'. 
* This is a typical example of how the code gets more and * more complicated and that it is very difficult to really understand * what is going on. * * Revision 1.14 1999/11/09 22:23:37 gerd * Removed the invocation of "init_dtd" of the root document. * This method is no longer available. The DTD is also passed to the * document object by the root element, so nothing essential changes. * * Revision 1.13 1999/10/25 23:37:09 gerd * Bugfix: The warning "More than one ATTLIST declaration for element * type ..." is only generated if an ATTLIST is found while there are already * attributes for the element. * * Revision 1.12 1999/09/01 23:08:38 gerd * New frontend function: parse_wf_document. This simply uses * a DTD that allows anything, and by the new parameter "extend_dtd" it is * avoided that element, attlist, and notation declarations are added to this * DTD. The idea is that this function simulates a well-formedness parser. * Tag_beg, Tag_end carry the entity_id. The "elstack" stores the * entity_id of the stacked tag. This was necessary because otherwise there * are some examples that produce incorrectly nested elements. * p_internal_subset is a variable that stores whether the internal * subset is being parsed. This is important because entity declarations in * internal subsets are not allowed to contain parameter references. * It is checked if the "elstack" is empty after all has been parsed. * Processing instructions outside DTDs and outside elements are now * added to the document. * The rules of mixed and regexp style content models have been * separated. The code is now much simpler. * Entity references outside elements are detected and rejected. * * Revision 1.11 1999/09/01 16:26:08 gerd * Improved the quality of error messages. * * Revision 1.10 1999/08/31 19:13:31 gerd * Added checks on proper PE nesting. The idea is that tokens such * as Decl_element and Decl_rangle carry an entity ID with them. This ID * is simply an object of type < >, i.e. 
you can only test on identity. * The lexer always produces tokens with a dummy ID because it does not * know which entity is the current one. The entity layer replaces the dummy * ID with the actual ID. The parser checks that the IDs of pairs such as * Decl_element and Decl_rangle are the same; otherwise a Validation_error * is produced. * * Revision 1.9 1999/08/15 20:42:01 gerd * Corrected a misleading message. * * Revision 1.8 1999/08/15 20:37:34 gerd * Improved error messages. * Bugfix: While parsing document entities, the subclass document_entity is * now used instead of external_entity. The rules in document entities are a bit * stronger. * * Revision 1.7 1999/08/15 14:03:59 gerd * Empty documents are not allowed. * "CDATA section not allowed here" is a WF_error, not a Validation_ * error. * * Revision 1.6 1999/08/15 02:24:19 gerd * Removed some grammar rules that were used for testing. * Documents without DTD can now have arbitrary elements (formerly * they were not allowed to have any element). * * Revision 1.5 1999/08/14 22:57:20 gerd * It is allowed that external entities are empty because the * empty string is well-parsed for both declarations and contents. Empty * entities can be referenced anywhere because the references are replaced * by nothing. Because of this, the Begin_entity...End_entity brace is only * inserted if the entity is non-empty. (Otherwise references to empty * entities would not be allowed anywhere.) * As a consequence, the grammar has been changed such that a * single Eof is equivalent to Begin_entity,End_entity without content. * * Revision 1.4 1999/08/14 22:20:01 gerd * The "config" slot has now a component "warner" which is * an object with a "warn" method. This is used to warn about characters * that cannot be represented in the Latin 1 alphabet. * Furthermore, there is a new component "debugging_mode". * Some Parse_error exceptions have been changed into Validation_error. 
* The interfaces of functions/classes imported from other modules * have changed; the invocations have been adapted. * Contents may contain CDATA sections that have been forgotten. * * Revision 1.3 1999/08/11 15:00:41 gerd * The Begin_entity ... End_entity brace is also possible in * 'contents'. * The configuration passed to the parsing object contains always * the resolver that is actually used. * * Revision 1.2 1999/08/10 21:35:12 gerd * The XML/encoding declaration at the beginning of entities is * evaluated. In particular, entities have now a method "xml_declaration" * which returns the name/value pairs of such a declaration. The "encoding" * setting is interpreted by the entity itself; "version", and "standalone" * are interpreted by Markup_yacc.parse_document_entity. Other settings * are ignored (this does not conform to the standard; the standard prescribes * that "version" MUST be given in the declaration of document; "standalone" * and "encoding" CAN be declared; no other settings are allowed). * TODO: The user should be warned if the standard is not exactly * fulfilled. -- The "standalone" property is not checked yet. * * Revision 1.1 1999/08/10 00:35:52 gerd * Initial revision. * * *)