X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_yacc.m2y;fp=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_yacc.m2y;h=91de7cd2f06ff6ec81ee36ca1c85b696eaebf37d;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/pxp/pxp_yacc.m2y b/helm/DEVEL/pxp/pxp/pxp_yacc.m2y new file mode 100644 index 000000000..91de7cd2f --- /dev/null +++ b/helm/DEVEL/pxp/pxp/pxp_yacc.m2y @@ -0,0 +1,2528 @@ +(* $Id$ -*- tuareg -*- + * ---------------------------------------------------------------------- + * PXP: The polymorphic XML parser for Objective Caml. + * Copyright by Gerd Stolpmann. See LICENSE for details. + *) + +open Parsing +open Pxp_types +open Pxp_lexer_types +open Pxp_dtd +open Pxp_entity +open Pxp_document +open Pxp_aux + +(* Some types from the interface definition: *) + +exception ID_not_unique + +class type [ 'ext ] index = +object + constraint 'ext = 'ext node #extension + method add : string -> 'ext node -> unit + method find : string -> 'ext node +end + + +type config = + { warner : collect_warnings; + errors_with_line_numbers : bool; + enable_pinstr_nodes : bool; + enable_super_root_node : bool; + enable_comment_nodes : bool; + encoding : rep_encoding; + recognize_standalone_declaration : bool; + store_element_positions : bool; + idref_pass : bool; + validate_by_dfa : bool; + accept_only_deterministic_models : bool; + debugging_mode : bool; + } + +type source = + Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver) + | ExtID of (ext_id * Pxp_reader.resolver) + + +type start_symbol = + Ext_document + | Ext_declarations + | Ext_element + + +type context = + { mutable current : unit -> token; (* get the current token *) + mutable get_next : unit -> token; (* go on to the next token; return it *) + mutable current_token : token; (* This is the current token *) + mutable manager : entity_manager; (* The entity manager *) + } + + +let make_context entity_manager = + let c = + { current = (fun _ -> assert false); + get_next = (fun _ -> assert false); + current_token = Eof; + manager = entity_manager; + } + in + (* Note that the function which is stored in get_next_ref can be changed + * as a side-effect when an entity is opened or closed. The function in + * c.get_next must be programmed such that always the current "get_next" + * function is executed. + *) + let get_next_ref = entity_manager # yy_get_next_ref in + c.current <- (fun () -> c.current_token); + c.get_next <- (fun () -> let tok = !get_next_ref() in + c.current_token <- tok; + tok); + ignore(c.get_next()); + c +;; + + +let from_channel ?system_encoding ?id:init_id ?fixenc ch = + + (* Reading from a channel works by modifying the algorithm of + * resolve_as_file. + *) + + let url_syntax = (* A syntax suitable for "file" URLs *) + { Neturl.null_url_syntax with + Neturl.url_enable_scheme = Neturl.Url_part_allowed; + Neturl.url_enable_host = Neturl.Url_part_allowed; + Neturl.url_enable_path = Neturl.Url_part_required; + Neturl.url_accepts_8bits = true; + } + in + + let an_url = + Neturl.make_url + ~scheme: "file" + ~host: "" + ~path: [ "" ] + url_syntax + in + + let init_channel_done = ref false in + (* Whether the first access to this source has already happened. *) + + (* The task of url_of_id is: + * - When it is called the first time, and no init_id is present, + * the URL file:/// is passed back (an_url). This forces that + * absolute path names /path/dir/... will be interpreted as + * file path names. (But relative path names will not work.) + * - If an init_id has been passed, we can assume that the opened URL + * is exactly this init_id. By raising Not_competent it is indicated + * that the standard method is to be used for the interpretation of + * the URL. + * - Otherwise, the channel is already being read, and thus cannot again + * opened. (This case is handled in channel_of_url.) + *) + + let url_of_id xid = + if !init_channel_done then begin + (* Use the normal way of determining the URL of the ID: *) + raise Pxp_reader.Not_competent + end + else begin + match init_id with + None -> + an_url + (* If the channel is not associated with any URL: Simply pass + * the URL file:/// back. + *) + | Some the_init_id -> + assert (the_init_id = xid); + raise Pxp_reader.Not_competent + (* If the channel is associated with a URL, the corresponding + * ID must be passed when the first invocation happens. + *) + end + in + + (* The task of channel_of_url: + * - If it is called the first time ("else"), the channel is returned + * - Otherwise, the channel is already being read, and thus cannot again + * opened. By raising Not_competent it is signaled that the + * resolve_as_file object must not continue to open the URL. + *) + + let channel_of_url url = + if !init_channel_done then + raise Pxp_reader.Not_competent + else begin + init_channel_done := true; + ch, fixenc + end + in + + let r = + new Pxp_reader.resolve_as_file + ?system_encoding:system_encoding + ~url_of_id:url_of_id + ~channel_of_url:channel_of_url + () + in + + let init_xid = + match init_id with + None -> Anonymous + | Some id -> + (* Note: 'id' may be illegal (malformed); in this case, the first + * invocation of url_of_id will raise Not_competent, and the 'open_in' + * method will fail. + *) + id + in + + ExtID(init_xid, r) +;; + + +let from_file ?system_encoding utf8_filename = + + let r = + new Pxp_reader.resolve_as_file + ?system_encoding:system_encoding + () + in + + let utf8_abs_filename = + if utf8_filename <> "" && utf8_filename.[0] = '/' then + utf8_filename + else + Sys.getcwd() ^ "/" ^ utf8_filename + in + + let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in + let url = Neturl.make_url + ~scheme:"file" + ~host:"localhost" + ~path:(Neturl.split_path utf8_abs_filename) + syntax + in + + let xid = System (Neturl.string_of_url url) in + + + ExtID(xid, r) +;; + + +let from_string ?fixenc s = + let r = + new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in + ExtID(Anonymous, r) +;; + + +(**********************************************************************) + +class ['ext] parser_object + init_doc init_dtd init_extend_dtd init_config init_resolver init_spec + init_process_xmldecl transform_dtd id_index + = + object (self) + + (* Note that the 'ext parameter has been the motivation to make the + * parser a class. + *) + + val mutable dtd = init_dtd + (* The DTD being parsed; or the DTD currently assumed *) + + val extend_dtd = init_extend_dtd + (* Whether the DTD should be extended by ELEMENT, ATTLIST, and + * NOTATION declarations or not. (True for validating mode, + * false for well-formedness mode.) + *) + + val transform_dtd = transform_dtd + (* A function transforming the DTD *) + + val id_index = (id_index : 'ext index option) + (* The ID index or None *) + + val process_xmldecl = init_process_xmldecl + (* Whether the XML declaration is parsed and the found XML version + * and standalone declaration are passed to 'doc'. + *) + + val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding) + + val doc = init_doc + (* The current document *) + + method doc = (doc : 'ext document) + + val resolver = init_resolver + (* The resolver for external IDs *) + + val config = init_config + (* The current configuration *) + + val elstack = (Stack.create() : ('ext node * entity_id) Stack.t) + (* The element stack containing all open elements, i.e. elements that + * have begun by a start tag but that have not been finished (end tag). + * If the parser sees a start tag, it creates the element and pushes it + * on top of this stack. If the parser recognizes an end tag, it pulls + * one element from the stack and checks if it has the same name as + * given with the end tag. + * + * At initialization time, a special element is pushed on the stack, + * the so-called super root. It is always the bottommost + * element of the stack, and serves as a guard. + * [See "initializer" below.] + *) + + method current = + (* Get the top element of the element stack *) + try + fst(Stack.top elstack) + with + Stack.Empty -> assert false + (* Not possible, because the super root is always the element + * at the bottom of the stack. + *) + + val mutable n_tags_open = 0 + (* Number of begin tags that have been parsed and whose corresponding + * end tags have not yet been parsed + *) + + val mutable p_internal_subset = false + (* true while parsing the internal subset - there are some additional + * constraints for internal subsets, and because of this it must + * be known whether the current declaration is contained in the + * internal or external subset of the DTD. + *) + + val mutable root = None + (* Contains the root element (topmost element) while it is being parsed + * and after it has been parsed. + * This variable is None before the root element is seen. + *) + + method root = root + + val spec = init_spec + (* A hashtable that contains exemplar objects for the various element + * types. If an element is parsed, the exemplar is looked up and + * "cloned" (by the "create" method) + *) + + val mutable current_data = [] + (* Collects character data. *) + + method collect_data s = + (* Collects the character material 's' *) + current_data <- s :: current_data + + method save_data = + (* Puts the material collected in 'current_data' into a new + * node, and appends this node as new sub node to 'current' + *) + match current_data with + [] -> + () + | [ str ] -> + if str <> "" then + self # current # add_node (create_data_node spec dtd str); + current_data <- [] + | _ -> + let count = List.fold_left + (fun acc s -> acc + String.length s) + 0 + current_data in + let str = String.create count in + let pos = ref count in + List.iter + (fun s -> + let l = String.length s in + pos := !pos - l; + String.blit + ~src:s + ~src_pos:0 + ~dst:str + ~dst_pos:(!pos) + ~len:l + ) + current_data; + assert(!pos = 0); + if str <> "" then + self # current # add_node (create_data_node spec dtd str); + current_data <- [] + + + method only_whitespace data = + (* Checks that the string "data" contains only whitespace. On failure, + * Validation_error is raised. + *) + let lexbuf = Lexing.from_string data in + let t1 = lexerset.scan_name_string lexbuf in + if t1 <> Ignore then + raise(WF_error("Data not allowed here")); + let t2 = lexerset.scan_name_string lexbuf in + if t2 <> Eof then + raise(WF_error("Data not allowed here")); + () + + initializer + (* CHECKS: *) + if config.encoding <> dtd # encoding then + failwith("Encoding mismatch"); + + (* --- Initialize 'elstack': Push the super-root on the stack. *) + let super_root = + if config.enable_super_root_node then + create_super_root_node spec dtd + else + (* because spec may not contain an exemplar for the super root: *) + create_no_node spec dtd + in + (* Move the super root or the emulation to the stack: *) + Stack.push (super_root, (self :> entity_id)) elstack; + + + + (********* Here the method "parse" begins. The grammar below is + * transformed to a local function of this method + *) + + method parse context start_symbol = + + let parse_ignored_section yy_current yy_get_next = + (* A special parser which should be used after is found. + *) + + while yy_current() = Ignore do + ignore(yy_get_next()); + done; + + ( match yy_current() with + Conditional_body _ -> () + | _ -> raise Parsing.Parse_error; + ); + + let en = context.manager # current_entity in + let llev = ref 1 in + while !llev >= 1 do + let igntok = en # next_ignored_token in + (* next_ignored_token: uses a special lexer that only + * recognizes Conditional_begin and Conditional_end; + * other character combinations are ignored. + *) + (* NOTE: next_ignored_token works much like yy_get_next, + * but it does not set the current token! + *) + match igntok with + Conditional_begin _ -> + llev := !llev + 1 + | Conditional_end _ -> + llev := !llev - 1; + (* Because the loop may be exited now: *) + context.current_token <- igntok; + | (End_entity | Eof) -> + raise Parsing.Parse_error + | _ -> + () + done; + + in + + + let check_and_parse_xmldecl xmldecl = + if process_xmldecl then begin + let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in + check_version_num v; + doc # init_xml_version v; + let v = match s with + None -> false + | Some "yes" -> true + | Some "no" -> false + | _ -> raise (WF_error("Illegal 'standalone' declaration")) + in + if config.recognize_standalone_declaration then + dtd # set_standalone_declaration v + end + in + + let recode_utf8 s = + (* Recode 's' to UTF-8 *) + if config.encoding = `Enc_utf8 then + s (* No recoding necessary *) + else + Netconversion.recode_string + ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s + in + + +%% + +/* The following grammar looks similar to ocamlyacc grammars, but + * ocamlyacc is actually not used to transform the grammar into a parser. + * Instead, the parser generator m2parsergen is applied. + * + * The format of the grammar is different (see m2parsergen/README), + * but I hope that you can understand most features immediately. + * + * The type of the parser is different: m2parsergen creates a top-down + * parser while ocamlyacc generates a LALR-1 parser. + * + * The way the generated code is called is different: ocamlyacc produces + * lots of top-level definitions whereas m2parsergen generates only + * a local let-in-phrase. This is explained in the already mentioned + * README file. + */ + +/* See Pxp_types.ml for comments to the various tokens */ + +%token Begin_entity +%token End_entity +%token Comment_begin +%token Comment_end +%token Ignore +%token Eq +%token Rangle +%token Rangle_empty +%token <> Conditional_begin +%token <> Conditional_body +%token <> Conditional_end +%token Percent +%token Plus +%token Star +%token Bar +%token Comma +%token Qmark +%token Pcdata +%token Required +%token Implied +%token Fixed +%token Eof + +%token <> Comment_material +%token <> Doctype +%token <> Doctype_rangle +%token <> Dtd_begin +%token <> Dtd_end +%token <> Decl_element +%token <> Decl_attlist +%token <> Decl_entity +%token <> Decl_notation +%token <> Decl_rangle +%token <> Lparen +%token <> Rparen +%token <> RparenPlus +%token <> RparenStar +%token <> RparenQmark + +%token <> Tag_beg +%token <> Tag_end + +%token <> PI +%token <> PI_xml +%token <> Cdata +%token <> CRef +%token <> ERef +%token <> PERef +%token <> CharData +%token <> LineEnd +%token <> Name +%token <> Nametoken +%token <> Attval +%token <> Attval_nl_normalized +%token <> Unparsed_string + +/* START SYMBOLS: + * + * "ext_document": parses a complete XML document (i.e. containing a + * and an element) + * "ext_declarations": parses an "external DTD subset", i.e. a sequence + * of declarations + * "ext_element": parses a single element (no allowed); + * the element needs not to be the root element of the + * DTD + * + * The functions corresponding to these symbols return always () because + * they only have side-effects. + */ + +/* SOME GENERAL COMMENTS: + * + * The parser does not get its tokens from the lexers directly. Instead of + * this, there is an entity object between the parser and the lexers. This + * object already handles: + * + * - References to general and parameter entities. The token stream is + * modified such that tokens automatically come from the referenced entities. + * External parameter entities and all general entities are embraced by + * the two special tokens Begin_entity and End_entity. The parser must + * check that these braces are correctly nested. + */ + +%% + + +ext_document(): + Begin_entity + doc_xmldecl_then_misc_then_prolog_then_rest() End_entity + {{ + if n_tags_open <> 0 then + raise(WF_error("Missing end tag")) + }} + + +/* In the following rule, we must find out whether there is an XML declaration + * or not, and directly after that either "process_xmldecl" or + * "process_missing_xmldecl" of the current entity must be called. + * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out + * in the "$" clause immediately following the first token. + * + * TODO: This is not enough. The first token may be a tag, and the tag + * may already contain non-ASCII characters. (But in this case, the resolvers + * assume UTF8, and they are right...) + */ + +doc_xmldecl_then_misc_then_prolog_then_rest(): + pl:PI_xml + $ {{ context.manager # current_entity # process_xmldecl pl; + check_and_parse_xmldecl pl; + }} + misc()* doc_prolog_then_rest() + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + misc() misc()* doc_prolog_then_rest() + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + doctypedecl() misc()* contents_start() + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + contents_start() + {{ () }} + + +doc_prolog_then_rest(): + doctypedecl() misc()* contents_start() + {{ () }} +| contents_start() + {{ () }} + + +ext_element(): + Begin_entity el_xmldecl_then_misc_then_rest() End_entity + {{ + if n_tags_open <> 0 then + raise(WF_error("Missing end tag")) + }} + + +/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ + +el_xmldecl_then_misc_then_rest(): + pl:PI_xml + $ {{ context.manager # current_entity # process_xmldecl pl; }} + misc()* contents_start() + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + misc() misc()* contents_start() + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + contents_start() + {{ () }} + + +ext_declarations(): + /* Parses a sequence of declarations given by an entity. As side-effect, + * the parsed declarations are put into the dtd object. + */ + Begin_entity decl_xmldecl_then_rest() + {{ () }} +| Eof + {{ () }} + + +decl_xmldecl_then_rest(): + /* Note: This rule is also called from declaration()! */ + pl:PI_xml + $ {{ context.manager # current_entity # process_xmldecl pl; + }} + declaration()* End_entity + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + declaration() declaration()* End_entity + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + End_entity + {{ () }} + + +misc(): + pi() + {{ () }} +| data: CharData + /* In this context, the lexers sometimes do not recognize white space; + * instead CharData tokens containing white space are delivered. + */ + {{ self # only_whitespace data }} +| Ignore + {{ () }} +| comment() + {{ () }} + + +/********************* DOCUMENT TYPE DECLARATION *************************/ + +doctypedecl(): + /* parses from . As side-effect, first the declarations of + * the internal DTD (if any) are put into !!on_dtd, then the declarations + * of the external DTD (if any) are put into this DTD object. + */ + doctype_entid: Doctype + ws: Ignore Ignore* + doctypedecl_material (doctype_entid) + {{ () }} + ? {{ match !yy_position with + "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'")) + | _ -> raise(WF_error("Bad DOCTYPE declaration")) + }} + + +/* TRICK: + * ws: Ignore? Ignore* + * is meant seriously. The effect is that ws becomes a boolean variable + * which is true if there is an Ignore token and false otherwise. + * This construct is faster than just + * ws: Ignore* + * in which case ws becomes an integer variable containing the number of + * Ignore tokens. Counting the number of tokens is slower than only checking + * the existence. + * + * We need the information whether there is an Ignore token (representing + * white space), because white space is only obligatory if also an identifier + * for the external subset is parsed; this conditional syntax constraint is + * simply programmed in the body of the grammar rule. + */ + +doctypedecl_material(doctype_entid): + root_name: Name + ws: Ignore? Ignore* + external_subset: external_id()? + Ignore* + internal_subset: internal_dtd()? + Ignore* + doctype_rangle_entid: Doctype_rangle + {{ + if doctype_entid != doctype_rangle_entid then + raise (Validation_error("Entities not properly nested with DOCTYPE declaration")); + dtd # set_root root_name; + begin match external_subset, internal_subset with + None, None -> () (* no DTD means no ID *) + | None, Some _ -> dtd # set_id Internal + | Some id, None -> dtd # set_id (External id) + | Some id, Some _ -> dtd # set_id (Derived id) + end; + (* Get now the external doctype declaration. Note that the internal + * subset has precedence and must be read first. + *) + begin match external_subset with + None -> () + | Some id -> + if not ws then + raise(WF_error("Whitespace is missing after `DOCTYPE " ^ + root_name ^ "'")); + let r' = resolver # clone in + let pobj = + new parser_object + (new document config.warner) + dtd + extend_dtd + config + r' + spec + process_xmldecl + (fun x -> x) + None + in + let en = new external_entity r' dtd "[dtd]" + config.warner id false config.errors_with_line_numbers + config.encoding + in + en # set_debugging_mode (config.debugging_mode); + let mgr = new entity_manager en in + en # open_entity true Declaration; + try + let context = make_context mgr in + pobj # parse context Ext_declarations; + ignore(en # close_entity); + with + error -> + ignore(en # close_entity); + r' # close_all; + let pos = mgr # position_string in + raise (At(pos, error)) + end; + dtd # validate + }} + ? {{ + match !yy_position with + "doctype_rangle_entid" -> raise(WF_error("`>' expected")) + | _ -> raise(WF_error("Bad DOCTYPE declaration")) + }} + +/* Note that there are no keywords for SYSTEM or PUBLIC, as these would + * be difficult to recognize in the lexical contexts. Because of this, + * SYSTEM/PUBLIC is parsed as name, and the rule for everything after + * SYSTEM/PUBLIC is computed dynamically. + */ + +external_id(): + tok:Name + $ {{ + let followup = + match tok with + "SYSTEM" -> parse_system_id + (* Apply the rule system_id (below) to parse the + * rest of the ID + *) + | "PUBLIC" -> parse_public_id + (* Apply the rule public_id (below) to parse the + * rest of the ID + *) + | _ -> raise(WF_error("SYSTEM or PUBLIC expected")) + in + }} + ws:Ignore Ignore* + r:[followup]() + {{ r }} + ? {{ match !yy_position with + "ws" -> raise(WF_error("Whitespace is missing after " ^ tok)) + | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier")) + }} + + +system_id(): + str:Unparsed_string + {{ System (recode_utf8 str) }} + + +public_id(): + str1: Unparsed_string + ws: Ignore Ignore* + str2: Unparsed_string + {{ check_public_id str1; + Public(recode_utf8 str1, recode_utf8 str2) + }} + ? {{ match !yy_position with + "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier")) + | _ -> raise(WF_error("Bad PUBLIC identifier")) + }} + + +/* The internal subset: "[" declaration* "]". While parsing the declarations + * the object variable p_internal_subset must be true; however, if there + * are entity references, this variable must be reset to false during + * the entity. (See the rule for "declaration" below.) + */ + +internal_dtd(): + dtd_begin_entid: internal_dtd_begin() + declaration()* + dtd_end_entid: internal_dtd_end() + {{ + if dtd_begin_entid != dtd_end_entid then + raise(Validation_error("Entities not properly nested with internal DTD subset")) + }} + ? {{ match !yy_position with + "dtd_end_entid" -> raise(WF_error("`]' expected")) + | _ -> raise(WF_error("Bad internal DTD subset")) + }} + + +internal_dtd_begin(): + Dtd_begin + {{ assert (not p_internal_subset); + p_internal_subset <- true }} + + +internal_dtd_end(): + Dtd_end + {{ assert p_internal_subset; + p_internal_subset <- false }} + + +declaration(): + /* Parses a single declaration (or processing instruction). As side-effect + * the parsed declaration is stored into the dtd object. + */ + elementdecl() + {{ () }} +| attlistdecl() + {{ () }} +| entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid) + {{ () }} + ? {{ match !yy_position with + "ws" -> raise(WF_error("Whitespace is missing after ENTITY")) + | "e" -> raise(WF_error("Name or `%' expected")) + | _ -> raise(WF_error("Bad entity declaration")) + }} +| notationdecl() + {{ () }} +| pi: PI + {{ let target, value = pi in + let pi = new proc_instruction target value config.encoding in + dtd # add_pinstr pi + }} +| Ignore + {{ () }} +| Comment_begin Comment_material* ce:Comment_end + {{ () }} + ? {{ match !yy_position with + "ce" -> raise(WF_error("`-->' expected")) + | _ -> raise(WF_error("Bad comment")) + }} +| Begin_entity + $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity' + * rule is parsed. This allows unrestricted usage of parameter entities + * within declarations of internal entities. + *) + let old_p_internal_subset = p_internal_subset in + p_internal_subset <- false; + }} + decl_xmldecl_then_rest() + {{ (* Restore the old value of 'p_internal_subset'. *) + p_internal_subset <- old_p_internal_subset; + () + }} +| begin_entid:Conditional_begin + $ {{ (* Check whether conditional sections are allowed at this position. *) + if p_internal_subset then + raise(WF_error("Restriction of the internal subset: Conditional sections not allowed")); + }} + Ignore* + cond:conditional_section() end_entid:Conditional_end + {{ (* Check whether Conditional_begin and Conditional_end are in the same + * entity. (This restriction is explained in the file SPECS.) + *) + if begin_entid != end_entid then + raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)")); + }} + ? {{ match !yy_position with + "end_entid" -> raise(WF_error("`>]>' expected")) + | "cond" -> raise(WF_error("INCLUDE or IGNORE expected")) + | _ -> raise(WF_error("Bad conditional section")) + }} + +/* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the + * right parsing rule is dynamic. + * Note that parse_ignored_section is not defined by a grammar rule but + * by a conventional let-binding above. + */ + +conditional_section(): + include_or_ignore:Name + $ {{ let parsing_function = + match include_or_ignore with + "INCLUDE" -> parse_included_section + (* invoke rule "included_section" below *) + | "IGNORE" -> parse_ignored_section + (* invoke function "parse_ignored_section" *) + | _ -> raise(WF_error("INCLUDE or IGNORE expected")) + in + }} + [ parsing_function ] () + {{ () }} + ? {{ raise(WF_error("Bad conditional section")) }} + +included_section(): + Conditional_body declaration()* + {{ () }} +| Ignore Ignore* Conditional_body declaration()* + {{ () }} + + +/*************************** ELEMENT DECLARATIONS ********************/ + +elementdecl(): + /* parses . Puts the parsed element type as side-effect into + * dtd. + */ + decl_element_entid: Decl_element + $ {{ let extdecl = context.manager # current_entity_counts_as_external in + }} + ws1: Ignore Ignore* + name: Name + ws2: Ignore Ignore* + content_model: contentspec() + Ignore* + decl_rangle_entid: Decl_rangle + {{ + if decl_element_entid != decl_rangle_entid then + raise (Validation_error "Entities not properly nested with ELEMENT declaration"); + if extend_dtd then begin + let el = new dtd_element dtd name in + (* It is allowed that an precedes the corresponding + * . Because of this it is possible that there is already + * an element called 'name' in the DTD, and we only must set the content + * model of this element. + *) + try + dtd # add_element el; + el # set_cm_and_extdecl content_model extdecl; + with + Not_found -> (* means: there is already an element 'name' *) + let el' = dtd # element name in + el' # set_cm_and_extdecl content_model extdecl; + (* raises Validation_error if el' already has a content model *) + end + }} + ? {{ match !yy_position with + ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) + | "name" -> raise(WF_error("The name of the element is expected here")) + | "content_model" -> raise(WF_error("Content model expression expected")) + | "decl_rangle_entid" -> raise(WF_error("`>' expected")) + | _ -> raise(WF_error("Bad element type declaration")) + }} + +contentspec(): + /* parses a content model and returns it (type content_model_type) */ + name: Name /* EMPTY or ANY */ + {{ match name with + "EMPTY" -> Empty + | "ANY" -> Any + | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected")) + }} +| entid:Lparen Ignore* term:mixed_or_regexp(entid) + {{ term }} + ? {{ raise(WF_error("Bad content model expression")) }} + + +/* Many of the following rules have an lparen_entid argument. This is the + * internal ID of the entity containing the corresponding left parenthesis; + * by comparing it with the ID of the entity of the right parenthesis the + * contraint is implemented that both parentheses must be in the same entity. + */ + +mixed_or_regexp(lparen_entid): + re: choice_or_seq(lparen_entid) + {{ Regexp re }} +| m: mixed(lparen_entid) + {{ m }} + + +multiplier(): + /* returns one of the multiplier symbols (?,*,+) */ + Plus + {{ Plus }} +| Star + {{ Star }} +| Qmark + {{ Qmark }} + + +mixed (lparen_entid) : + Pcdata + Ignore* + material: mixed_alternatives_top() + {{ + let rest, rparen_entid = material in + if lparen_entid != rparen_entid then + raise (Validation_error "Entities not properly nested with parentheses"); + Mixed (MPCDATA :: rest) + }} + ? {{ raise(WF_error("Bad content model expression")) }} + + +mixed_alternatives_top(): + entid: Rparen + {{ [], entid }} +| entid: RparenStar + {{ [], entid }} +| Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar + {{ + (MChild name :: names), entid + }} + ? {{ match !yy_position with + "name" -> raise(WF_error("Name expected")) + | "entid" -> raise(WF_error("`)*' expected")) + | _ -> raise(WF_error("Bad content model expression")) + }} + + +mixed_alternative() : + Bar Ignore* name:Name Ignore* + {{ MChild name }} + ? {{ match !yy_position with + "name" -> raise(WF_error("Name expected")) + | _ -> raise(WF_error("Bad content model expression")) + }} + + + +choice_or_seq (lparen_entid): + /* parses either a regular expression, or a mixed expression. Returns + * Mixed spec or Regexp spec (content_model_type). + * Which kind of expression (regexp or mixed) is being read is recognized + * after the first subexpression has been parsed; the other subexpressions + * must be of the same kind. + */ + re: cp() + Ignore* + factor: choice_or_seq_factor() + {{ + let (finalmark,subexpr), rparen_entid = factor in + if lparen_entid != rparen_entid then + raise (Validation_error "Entities not properly nested with parentheses"); + (* Check that the other subexpressions are "regexp", too, and + * merge them with the first. + *) + let re' = + match subexpr with + Alt [] -> re + | Alt alt -> Alt (re :: alt) + | Seq seq -> Seq (re :: seq) + | _ -> assert false + in + (* Interpret the finalmark. *) + match finalmark with + Ignore -> re' + | Plus -> Repeated1 re' + | Star -> Repeated re' + | Qmark -> Optional re' + | _ -> assert false + }} + ? {{ raise(WF_error("Bad content model expression")) }} + +choice_or_seq_factor(): + /* Parses "||...)" or ",,...)", both forms optionally + * followed by ?, *, or +. + * Returns ((finalmark, expr), rparen_entid), where + * - finalmark is the character after the right parenthesis or Ignore + * - expr is either + * Alt [] meaning that only ")" has been found + * Alt non_empty_list meaning that the subexpressions are separated by '|' + * Seq non_empty_list meaning that the subexpressions are separated by ',' + */ + entid:Rparen + {{ (Ignore, Alt []), entid }} +| entid:RparenPlus + {{ (Plus, Alt []), entid }} +| entid:RparenStar + {{ (Star, Alt []), entid }} +| entid:RparenQmark + {{ (Qmark, Alt []), entid }} +| Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor() + {{ + let (finalmark, subexpr), rparen_entid = factor in + begin match subexpr with + Alt [] -> (finalmark, (Alt [re])), rparen_entid + | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid + | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) + end + }} + ? {{ raise(WF_error("Bad content model expression")) }} +| Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor() + {{ + let (finalmark, subexpr), rparen_entid = factor in + begin match subexpr with + Alt [] -> (finalmark, (Seq [re])), rparen_entid + | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid + | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) + end + }} + ? {{ raise(WF_error("Bad content model expression")) }} + +cp(): + /* parse either a name, or a parenthesized subexpression "(...)" */ + name:Name m:multiplier()? + {{ match m with + None -> Child name + | Some Plus -> Repeated1 (Child name) + | Some Star -> Repeated (Child name) + | Some Qmark -> Optional (Child name) + | _ -> assert false + }} + ? {{ raise(WF_error("Bad content model expression")) }} +| entid:Lparen Ignore* m:choice_or_seq(entid) + {{ m }} + ? {{ raise(WF_error("Bad content model expression")) }} + + +/********************* ATTRIBUTE LIST DECLARATION ***********************/ + +attlistdecl(): + /* parses . Enters the attribute list in dtd as side- + * effect. + */ + decl_attlist_entid: Decl_attlist + $ {{ let extdecl = context.manager # current_entity_counts_as_external in + }} + ws1: Ignore Ignore* + el_name: Name + ws: Ignore? Ignore* + factor: attdef_factor() + {{ + let at_list, decl_rangle_entid = factor in + + if decl_attlist_entid != decl_rangle_entid then + raise (Validation_error "Entities not properly nested with ATTLIST declaration"); + + if not ws && at_list <> [] then begin + match at_list with + (name,_,_) :: _ -> + (* This is normally impossible, because the lexer demands + * some other token between two names. + *) + raise(WF_error("Whitespace is missing before `" ^ name ^ "'")); + | _ -> assert false + end; + + if extend_dtd then begin + let new_el = new dtd_element dtd el_name in + (* Note that it is allowed that precedes the corresponding + * declaration. In this case we add the element declaration + * already to the DTD but leave the content model unspecified. + *) + let el = + try + dtd # add_element new_el; + new_el + with + Not_found -> (* already added *) + let old_el = dtd # element el_name in + if old_el # attribute_names <> [] then + config.warner # warn ("More than one ATTLIST declaration for element type `" ^ + el_name ^ "'"); + old_el + in + List.iter + (fun (a_name, a_type, a_default) -> + el # add_attribute a_name a_type a_default extdecl) + at_list + end + }} + ? {{ match !yy_position with + "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST")) + | "el_name" -> raise(WF_error("The name of the element is expected here")) + | "factor" -> raise(WF_error("Another attribute name or `>' expected")) + | _ -> raise(WF_error("Bad attribute declaration")) + }} + + +attdef_factor(): + /* parses a list of triples and returns the + * list as (string * att_type * att_default) list. + */ + attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor() + {{ + let attdef_rest, decl_rangle_entid = factor in + if not ws && attdef_rest <> [] then begin + match attdef_rest with + (name,_,_) :: _ -> + raise(WF_error("Missing whitespace before `" ^ name ^ "'")); + | _ -> assert false + end; + (attdef :: attdef_rest), decl_rangle_entid }} + ? {{ match !yy_position with + | "factor" -> raise(WF_error("Another attribute name or `>' expected")) + | _ -> raise(WF_error("Bad attribute declaration")) + }} +| entid:Decl_rangle + {{ [], entid }} + + +attdef(): + /* Parses a single triple */ + name: Name + ws1: Ignore Ignore* + tp: atttype() + ws2: Ignore Ignore* + default: defaultdecl() + {{ (name,tp,default) }} + ? {{ match !yy_position with + ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) + | "tp" -> raise(WF_error("Type of attribute or `(' expected")) + | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected")) + | _ -> raise(WF_error("Bad attribute declaration")) + }} + +atttype(): + /* Parses an attribute type and returns it as att_type. */ + name: Name + $ {{ let followup = + if name = "NOTATION" then + parse_notation + else + parse_never + in + }} + nota: [followup]()? + {{ + match name with + "CDATA" -> A_cdata + | "ID" -> A_id + | "IDREF" -> A_idref + | "IDREFS" -> A_idrefs + | "ENTITY" -> A_entity + | "ENTITIES" -> A_entities + | "NMTOKEN" -> A_nmtoken + | "NMTOKENS" -> A_nmtokens + | "NOTATION" -> + (match nota with + None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)")) + | Some n -> n + ) + | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected")) + }} + ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }} + +| Lparen + Ignore* + name: name_or_nametoken() + Ignore* + names: nmtoken_factor()* + rp: Rparen + /* Enumeration */ + {{ A_enum(name :: names) }} + ? {{ match !yy_position with + "name" -> raise(WF_error("Name expected")) + | "names" -> raise(WF_error("`|' and more names expected, or `)'")) + | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) + | _ -> raise(WF_error("Bad enumeration type")) + }} + + +never(): + /* The always failing rule */ + $ {{ raise Not_found; }} + Doctype /* questionable */ + {{ A_cdata (* Does not matter *) + }} + + +notation(): + Ignore Ignore* + lp: Lparen + Ignore* + name: Name + Ignore* + names: notation_factor()* + rp: Rparen + {{ A_notation(name :: names) }} + ? {{ match !yy_position with + "lp" -> raise(WF_error("`(' expected")) + | "name" -> raise(WF_error("Name expected")) + | "names" -> raise(WF_error("`|' and more names expected, or `)'")) + | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) + | _ -> raise(WF_error("Bad NOTATION type")) + }} + + +notation_factor(): + /* Parse "|" and return the name */ + Bar Ignore* name:Name Ignore* + {{ name }} + ? {{ match !yy_position with + "name" -> raise(WF_error("Name expected")) + | _ -> raise(WF_error("Bad NOTATION type")) + }} + +nmtoken_factor(): + /* Parse "|" and return the nmtoken */ + Bar Ignore* n:name_or_nametoken() Ignore* + {{ n }} + ? {{ match !yy_position with + "n" -> raise(WF_error("Nametoken expected")) + | _ -> raise(WF_error("Bad enumeration type")) + }} + + +name_or_nametoken(): + n:Name {{ n }} +| n:Nametoken {{ n }} + + +/* The default values must be expanded and normalized. This has been implemented + * by the function expand_attvalue. + */ + + +defaultdecl(): + /* Parse the default value for an attribute and return it as att_default */ + Required + {{ D_required }} +| Implied + {{ D_implied }} +| Fixed ws:Ignore Ignore* str:Unparsed_string + {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }} + ? {{ match !yy_position with + "ws" -> raise(WF_error("Whitespace is missing after #FIXED")) + | "str" -> raise(WF_error("String literal expected")) + | _ -> raise(WF_error("Bad #FIXED default value")) + }} +| str:Unparsed_string + {{ D_default (expand_attvalue lexerset dtd str config.warner false) }} + + +/**************************** ENTITY DECLARATION ***********************/ + +entitydecl(decl_entity_entid): + /* parses everything _after_ . The parsed + * entity declaration is entered into the dtd object as side-effect. + */ + name: Name + $ {{ let extdecl = context.manager # current_entity_counts_as_external in + }} + ws: Ignore Ignore* + material: entitydef() + Ignore* + decl_rangle_entid: Decl_rangle + /* A general entity */ + {{ + if decl_entity_entid != decl_rangle_entid then + raise (Validation_error "Entities not properly nested with ENTITY declaration"); + let en = + (* Distinguish between + * - internal entities + * - external entities + * - NDATA (unparsed) entities + *) + match material with + (Some s, None, None) -> + new internal_entity dtd name config.warner s p_internal_subset + config.errors_with_line_numbers false config.encoding + | (None, Some xid, None) -> + new external_entity (resolver # clone) dtd name config.warner + xid false config.errors_with_line_numbers + config.encoding + + | (None, Some xid, Some n) -> + (new ndata_entity name xid n config.encoding :> entity) + | _ -> assert false + in + dtd # add_gen_entity en extdecl + }} + ? {{ match !yy_position with + "ws" -> raise(WF_error("Whitespace is missing")) + | "material" -> raise(WF_error("String literal or identifier expected")) + | "decl_rangle_entid" -> raise(WF_error("`>' expected")) + | _ -> raise(WF_error("Bad entity declaration")) + }} + +| Percent + $ {{ let extdecl = context.manager # current_entity_counts_as_external in + }} + ws1: Ignore Ignore* + name: Name + ws2: Ignore Ignore* + material: pedef() + Ignore* + decl_rangle_entid: Decl_rangle + /* A parameter entity */ + {{ + if decl_entity_entid != decl_rangle_entid then + raise (Validation_error "Entities not properly nested with ENTITY declaration"); + let en = + (* Distinguish between internal and external entities *) + match material with + (Some s, None) -> + new internal_entity dtd name config.warner s p_internal_subset + config.errors_with_line_numbers true config.encoding + | (None, Some xid) -> + new external_entity (resolver # clone) dtd name config.warner + xid true config.errors_with_line_numbers + config.encoding + | _ -> assert false + in + + (* The following two lines force that even internal entities count + * as external (for the standalone check) if the declaration of + * the internal entity occurs in an external entity. + *) + if extdecl then + en # set_counts_as_external; + + dtd # add_par_entity en; + }} + ? {{ match !yy_position with + ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) + | "material" -> raise(WF_error("String literal or identifier expected")) + | "decl_rangle_entid" -> raise(WF_error("`>' expected")) + | _ -> raise(WF_error("Bad entity declaration")) + }} + + +entitydef(): + /* parses the definition value of a general entity. Returns either: + * - (Some s, None, None) meaning the definition of an internal entity + * with (literal) value s has been found + * - (None, Some x, None) meaning that an external parsed entity with + * external ID x has been found + * - (None, Some x, Some n) meaning that an unparsed entity with + * external ID x and notations n has been found + */ + str:Unparsed_string + {{ Some str, None, None }} +| id:external_id() ws:Ignore? Ignore* decl:ndatadecl()? + {{ if not ws && decl <> None then + raise(WF_error("Whitespace missing before `NDATA'")); + None, Some id, decl + }} + + +pedef(): + /* parses the definition value of a parameter entity. Returns either: + * - (Some s, None) meaning that the definition of an internal entity + * with (literal) value s has been found + * - (None, Some x) meaning that an external ID x has been found + */ + str:Unparsed_string + {{ Some str, None }} +| id:external_id() + {{ None, Some id }} + + +ndatadecl(): + /* Parses either NDATA "string" or the empty string; returns Some "string" + * in the former, None in the latter case. + */ + ndata:Name ws:Ignore Ignore* name:Name + {{ if ndata = "NDATA" then + name + else + raise(WF_error("NDATA expected")) + }} + ? {{ match !yy_position with + "ws" -> raise(WF_error("Whitespace is missing after NDATA")) + | "name" -> raise(WF_error("Name expected")) + | _ -> raise(WF_error("Bad NDATA declaration")) + }} + +/**************************** NOTATION DECLARATION *******************/ + +notationdecl(): + /* parses and enters the notation declaration into the + * dtd object as side-effect + */ + decl_notation_entid: Decl_notation + ws1: Ignore Ignore* + name: Name + ws2: Ignore Ignore* + sys_or_public: Name /* SYSTEM or PUBLIC */ + ws3: Ignore Ignore* + str1: Unparsed_string + ws: Ignore? Ignore* + str2: Unparsed_string? + Ignore* + decl_rangle_entid: Decl_rangle + {{ + if decl_notation_entid != decl_rangle_entid then + raise (Validation_error "Entities not properly nested with NOTATION declaration"); + let xid = + (* Note that it is allowed that PUBLIC is only followed by one + * string literal + *) + match sys_or_public with + "SYSTEM" -> + if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument")); + System (recode_utf8 str1) + | "PUBLIC" -> + begin match str2 with + None -> + check_public_id str1; + Public(recode_utf8 str1,"") + | Some p -> + if not ws then + raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id")); + check_public_id str1; + Public(recode_utf8 str1, recode_utf8 p) + end + | _ -> raise(WF_error("PUBLIC or SYSTEM expected")) + in + if extend_dtd then begin + let no = new dtd_notation name xid config.encoding in + dtd # add_notation no + end + }} + ? {{ match !yy_position with + ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing")) + | "name" -> raise(WF_error("Name expected")) + | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected")) + | ("str1"|"str2") -> raise(WF_error("String literal expected")) + | "decl_rangle_entid" -> raise(WF_error("`>' expected")) + | _ -> raise(WF_error("Bad NOTATION declaration")) + }} + +/****************************** ELEMENTS **************************/ + +/* In the following rules, the number of error rules is reduced to + * improve the performance of the parser. + */ + + +contents_start(): + /* parses ... misc*, i.e. exactly one element followed + * optionally by white space or processing instructions. + * The element is entered into the global variables as follows: + * - If elstack is non-empty, the parsed element is added as new child to + * the top element of the stack. + * - If elstack is empty, the root_examplar object is modified rather than + * that a new element is created. If additionally the variable root is + * None, it is assigned Some root_examplar. + * Note that the modification of the root_exemplar is done by the method + * internal_init. + * The reason why the root element is modified rather than newly created + * is a typing requirement. It must be possible that the class of the root + * is derived from the original class element_impl, i.e. the user must be + * able to add additional methods. If we created a new root object, we + * would have to denote to which class the new object belongs; the root + * would always be an 'element_impl' object (and not a derived object). + * If we instead cloned an exemplar object and modified it by the + * "create" method, the root object would belong to the same class as the + * exemplar (good), but the type of the parsing function would always + * state that an 'element_impl' was created (because we can pass the new + * object only back via a global variable). The only solution is to + * modify the object that has been passed to the parsing function directly. + */ + $ {{ dtd <- transform_dtd dtd; }} + start_tag() content()* + {{ () }} + + +content(): + /* parses: start tags, end tags, content, or processing + * instructions. That the tags are properly nested is dynamically checked. + * As result, recognized elements are added to their parent elements, + * content is added to the element containing it, and processing instructions + * are entered into the element embracing them. (All as side-effects.) + */ + start_tag() + {{ () }} +| end_tag() + {{ () }} +| char_data() + {{ () }} +| cref() + {{ () }} +| pi() + {{ () }} +| entity_ref() + {{ () }} +| comment() + {{ () }} + + +entity_ref(): + Begin_entity eref_xmldecl_then_rest() + {{ if n_tags_open = 0 then + raise(WF_error("Entity reference not allowed here")) + }} + + +/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ + +eref_xmldecl_then_rest(): + pl:PI_xml + $ {{ context.manager # current_entity # process_xmldecl pl; + }} + content()* End_entity + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + content() content()* End_entity + {{ () }} + +| $ {{ context.manager # current_entity # process_missing_xmldecl; }} + End_entity + {{ () }} + + +start_tag(): + /* parses or . + * + * EFFECT: If elstack is non-empty, the element is added to the + * top element of the stack as new child, and the element + * is pushed on the stack. If elstack is empty, the root_exemplar is + * modified and gets the parsed name and attribute list. The root_exemplar + * is pushed on the stack. If additionally the variable root is empty, too, + * this variable is initialized. + * If the form has been parsed, no element is pushed + * on the stack. + */ + tag: Tag_beg + $ {{ let position = + if config.store_element_positions then + Some(context.manager # position) + else + None + in + }} + ws: Ignore? Ignore* + attlist: attribute()* + emptiness: start_tag_rangle() + /* Note: it is guaranteed that there is whitespace between Tag_beg and + * the name of the first attribute, because there must be some separator. + * So we need not to check ws! + */ + {{ + let rec check_attlist al = + match al with + (nv1, num1) :: al' -> + if not num1 && al' <> [] then begin + match al with + ((n1,_),_) :: ((n2,_),_) :: _ -> + raise(WF_error("Whitespace is missing between attributes `" ^ + n1 ^ "' and `" ^ n2 ^ "'")) + | _ -> assert false + end; + check_attlist al' + | [] -> () + in + check_attlist attlist; + + let name, tag_beg_entid = tag in + let attlist' = List.map (fun (nv,_) -> nv) attlist in + let d = + create_element_node ?position:position spec dtd name attlist' in + + begin match id_index with + None -> () + | Some idx -> + (* Put the ID attribute into the index, if present *) + begin try + let v = d # id_attribute_value in (* may raise Not_found *) + idx # add v d (* may raise ID_not_unique *) + with + Not_found -> + (* No ID attribute *) + () + | ID_not_unique -> + (* There is already an ID with the same value *) + raise(Validation_error("ID not unique")) + end + end; + + if n_tags_open = 0 then begin + if root = None then begin + (* We have found the begin tag of the root element. *) + if config.enable_super_root_node then begin + (* The user wants the super root instead of the real root. + * The real root element becomes the child of the VR. + *) + (* Assertion: self # current is the super root *) + assert (self # current # node_type = T_super_root); + root <- Some (self # current); + self # current # add_node d; + doc # init_root (self # current); + end + else begin + (* Normal behaviour: The user wants to get the real root. *) + root <- Some d; + doc # init_root d; + end; + end + else + (* We have found a second topmost element. This is illegal. *) + raise(WF_error("Document must consist of only one toplevel element")) + end + else begin + (* We have found some inner begin tag. *) + self # save_data; (* Save outstanding data material first *) + self # current # add_node d + end; + + if emptiness then + (* An empty tag like . *) + d # local_validate ~use_dfa:config.validate_by_dfa () + else begin + (* A non-empty tag. *) + Stack.push (d, tag_beg_entid) elstack; + n_tags_open <- n_tags_open + 1; + end; + }} + ? {{ match !yy_position with + "attlist" -> raise(WF_error("Bad attribute list")) + | "emptiness" -> raise(WF_error("`>' or `/>' expected")) + | _ -> raise(WF_error("Bad start tag")) + }} + + +attribute(): + /* Parses name="value" */ + n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore* + {{ (n,v), ws }} + + +attval(): + v:Attval + {{ expand_attvalue lexerset dtd v config.warner true }} +| v:Attval_nl_normalized + {{ expand_attvalue lexerset dtd v config.warner false }} + + +start_tag_rangle(): + Rangle {{ false }} +| Rangle_empty {{ true }} + + +end_tag(): + /* parses . + * Pops the top element from the elstack and checks if it is the same + * element. + */ + tag:Tag_end Ignore* Rangle + {{ let name, tag_end_entid = tag in + if n_tags_open = 0 then + raise(WF_error("End-tag without start-tag")); + + self # save_data; (* Save outstanding data material first *) + + let x, tag_beg_entid = Stack.pop elstack in + let x_name = + match x # node_type with + | T_element n -> n + | _ -> assert false + in + if name <> x_name then + raise(WF_error("End-tag does not match start-tag")); + if tag_beg_entid != tag_end_entid then + raise(WF_error("End-tag not in the same entity as the start-tag")); + x # local_validate ~use_dfa:config.validate_by_dfa (); + + n_tags_open <- n_tags_open - 1; + + assert (n_tags_open >= 0); + + }} + +char_data(): + /* Parses any literal characters not otherwise matching, and adds the + * characters to the top element of elstack. + * If elstack is empty, it is assumed that there is no surrounding + * element, and any non-white space character is forbidden. + */ + data:CharData + {{ + if n_tags_open = 0 then + (* only white space is allowed *) + self # only_whitespace data + else + self # collect_data data + (* We collect the chardata material until the next end tag is + * reached. Then the collected material will concatenated and + * stored as a single T_data node (see end_tag rule above) + * using save_data. + *) + }} +| data:Cdata + {{ + if n_tags_open = 0 then + raise (WF_error("CDATA section not allowed here")); + self # collect_data data + (* Also collect CDATA material *) + }} + +cref(): + /* Parses &#...; and adds the character to the top element of elstack. */ + code:CRef + {{ + if n_tags_open = 0 then + (* No surrounding element: character references are not allowed *) + raise(WF_error("Character reference not allowed here")); + self # collect_data (character config.encoding config.warner code) + (* Also collect character references *) + }} + +pi(): + /* Parses (but not ). + * If there is a top element in elstack, the processing instruction is added + * to this element. + */ + pi: PI + {{ + let position = + if config.store_element_positions then + Some(context.manager # position) + else + None + in + let target,value = pi in + + if n_tags_open = 0 & not config.enable_super_root_node + then + doc # add_pinstr (new proc_instruction target value config.encoding) + else begin + (* Special case: if processing instructions are processed inline, + * they are wrapped into T_pinstr nodes. + *) + if config.enable_pinstr_nodes then begin + self # save_data; (* Save outstanding data material first *) + let pinstr = new proc_instruction target value config.encoding in + let wrapper = create_pinstr_node + ?position:position spec dtd pinstr in + wrapper # local_validate(); (* succeeds always *) + self # current # add_node wrapper; + end + else + (* Normal behaviour: Add the PI to the parent element. *) + self # current # add_pinstr + (new proc_instruction target value config.encoding) + end + }} + + +comment(): + /* Parses + */ + Comment_begin + $ {{ + let position = + if config.enable_comment_nodes && config.store_element_positions then + Some(context.manager # position) + else + None + in + }} + mat: Comment_material* + ce: Comment_end + {{ + if config.enable_comment_nodes then begin + self # save_data; (* Save outstanding data material first *) + let comment_text = String.concat "" mat in + let wrapper = create_comment_node + ?position:position spec dtd comment_text in + wrapper # local_validate(); (* succeeds always *) + self # current # add_node wrapper; + end + }} + ? {{ match !yy_position with + | "ce" -> raise(WF_error("`-->' expected")) + | _ -> raise(WF_error("Bad comment")) + }} + + +%% + (* The method "parse" continues here... *) + + try + match start_symbol with + Ext_document -> + parse_ext_document context.current context.get_next + | Ext_declarations -> + parse_ext_declarations context.current context.get_next + | Ext_element -> + parse_ext_element context.current context.get_next + with + Not_found -> + raise Parsing.Parse_error + + (*********** The method "parse" ends here *************) + + +(**********************************************************************) + +(* Here ends the class definition: *) +end +;; + +(**********************************************************************) + +open Pxp_reader;; + + +class default_ext = + object(self) + val mutable node = (None : ('a extension node as 'a) option) + method clone = {< >} + method node = + match node with + None -> + assert false + | Some n -> n + method set_node n = + node <- Some n + end +;; + + +let default_extension = new default_ext;; + +let default_spec = + make_spec_from_mapping + ~super_root_exemplar: (new element_impl default_extension) + ~comment_exemplar: (new element_impl default_extension) + ~default_pinstr_exemplar: (new element_impl default_extension) + ~data_exemplar: (new data_impl default_extension) + ~default_element_exemplar: (new element_impl default_extension) + ~element_mapping: (Hashtbl.create 1) + () +;; + + +let idref_pass id_index root = + let error t att value = + let name = + match t # node_type with + T_element name -> name + | _ -> assert false + in + let text = + "Attribute `" ^ att ^ "' of element `" ^ name ^ + "' refers to unknown ID `" ^ value ^ "'" in + let pos_ent, pos_line, pos_col = t # position in + if pos_line = 0 then + raise(Validation_error text) + else + raise(At("In entity " ^ pos_ent ^ " at line " ^ + string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^ + ":\n", + Validation_error text)) + in + + let rec check_tree t = + let idref_atts = t # idref_attribute_names in + List.iter + (fun att -> + match t # attribute att with + Value s -> + begin try ignore(id_index # find s) with + Not_found -> + error t att s + end + | Valuelist l -> + List.iter + (fun s -> + try ignore(id_index # find s) with + Not_found -> + error t att s + ) + l + | Implied_value -> () + ) + idref_atts; + List.iter check_tree (t # sub_nodes) + in + check_tree root +;; + + +exception Return_DTD of dtd;; + (* Used by extract_dtd_from_document_entity to jump out of the parser *) + + +let call_parser ~configuration:cfg + ~source:src + ~dtd + ~extensible_dtd + ~document:doc + ~specification:spec + ~process_xmldecl + ~transform_dtd + ~(id_index : 'ext #index option) + ~use_document_entity + ~entry + ~init_lexer = + let e = cfg.errors_with_line_numbers in + let w = cfg.warner in + let r, en = + match src with + Entity(m,r') -> r', m dtd + | ExtID(xid,r') -> r', + if use_document_entity then + new document_entity + r' dtd "[toplevel]" w xid e + cfg.encoding + else + new external_entity + r' dtd "[toplevel]" w xid false e + cfg.encoding + in + r # init_rep_encoding cfg.encoding; + r # init_warner w; + en # set_debugging_mode (cfg.debugging_mode); + let pobj = + new parser_object + doc + dtd + extensible_dtd + cfg + r + spec + process_xmldecl + transform_dtd + (id_index :> 'ext index option) + in + let mgr = new entity_manager en in + en # open_entity true init_lexer; + begin try + let context = make_context mgr in + pobj # parse context entry; + ignore(en # close_entity); + with + Return_DTD d -> + ignore(en # close_entity); + raise(Return_DTD d) + | error -> + ignore(en # close_entity); + r # close_all; + let pos = mgr # position_string in + raise (At(pos, error)) + end; + if cfg.idref_pass then begin + match id_index with + None -> () + | Some idx -> + ( match pobj # root with + None -> () + | Some root -> + idref_pass idx root; + ) + end; + pobj + + +let parse_dtd_entity cfg src = + (* Parse a DTD given as separate entity. *) + let dtd = new dtd cfg.warner cfg.encoding in + let doc = new document cfg.warner in + let pobj = + call_parser + ~configuration:cfg + ~source:src + ~dtd:dtd + ~extensible_dtd:true (* Extend the DTD by parsed declarations *) + ~document:doc + ~specification:default_spec + ~process_xmldecl:false (* The XML declaration is ignored + * (except 'encoding') + *) + ~transform_dtd:(fun x -> x) (* Do not transform the DTD *) + ~id_index: None + ~use_document_entity:false + ~entry:Ext_declarations (* Entry point of the grammar *) + ~init_lexer:Declaration (* The initially used lexer *) + in + dtd # validate; + if cfg.accept_only_deterministic_models then dtd # only_deterministic_models; + dtd +;; + + +let parse_content_entity ?id_index cfg src dtd spec = + (* Parse an element given as separate entity *) + dtd # validate; (* ensure that the DTD is valid *) + if cfg.accept_only_deterministic_models then dtd # only_deterministic_models; + let doc = new document cfg.warner in + let pobj = + call_parser + ~configuration:cfg + ~source:src + ~dtd:dtd + ~extensible_dtd:true (* Extend the DTD by parsed declarations *) + ~document:doc + ~specification:spec + ~process_xmldecl:false (* The XML declaration is ignored + * (except 'encoding') + *) + ~transform_dtd:(fun x -> x) (* Do not transform the DTD *) + ~id_index:(id_index :> 'ext index option) + ~use_document_entity:false + ~entry:Ext_element (* Entry point of the grammar *) + ~init_lexer:Content (* The initially used lexer *) + in + match pobj # root with + Some r -> r + | None -> raise(WF_error("No root element")) +;; + + +let parse_wfcontent_entity cfg src spec = + let dtd = new dtd cfg.warner cfg.encoding in + dtd # allow_arbitrary; + let doc = new document cfg.warner in + let pobj = + call_parser + ~configuration:cfg + ~source:src + ~dtd:dtd + ~extensible_dtd:false (* Do not extend the DTD *) + ~document:doc + ~specification:spec + ~process_xmldecl:false (* The XML declaration is ignored + * (except 'encoding') + *) + ~transform_dtd:(fun x -> x) (* Do not transform the DTD *) + ~id_index:None + ~use_document_entity:false + ~entry:Ext_element (* Entry point of the grammar *) + ~init_lexer:Content (* The initially used lexer *) + in + match pobj # root with + Some r -> r + | None -> raise(WF_error("No root element")) +;; + + +let iparse_document_entity ?(transform_dtd = (fun x -> x)) + ?id_index + cfg0 src spec p_wf = + (* Parse an element given as separate entity *) + (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *) + let cfg = { cfg0 with + recognize_standalone_declaration = + cfg0.recognize_standalone_declaration && (not p_wf) + } in + let dtd = new dtd cfg.warner cfg.encoding in + if p_wf then + dtd # allow_arbitrary; + let doc = new document cfg.warner in + let pobj = + call_parser + ~configuration:cfg + ~source:src + ~dtd:dtd + ~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations + * only if in validating mode + *) + ~document:doc + ~specification:spec + ~process_xmldecl:true (* The XML declaration is processed *) + (* TODO: change to 'not p_wf' ? *) + ~transform_dtd:(fun dtd -> + let dtd' = transform_dtd dtd in + if cfg.accept_only_deterministic_models then + dtd' # only_deterministic_models; + dtd') + + ~id_index:(id_index :> 'ext index option) + ~use_document_entity:true + ~entry:Ext_document (* Entry point of the grammar *) + ~init_lexer:Document (* The initially used lexer *) + in + pobj # doc +;; + + +let parse_document_entity ?(transform_dtd = (fun x -> x)) + ?id_index + cfg src spec = + iparse_document_entity + ~transform_dtd:transform_dtd + ?id_index:(id_index : 'ext #index option :> 'ext index option) + cfg src spec false;; + +let parse_wfdocument_entity cfg src spec = + iparse_document_entity cfg src spec true;; + +let extract_dtd_from_document_entity cfg src = + let transform_dtd dtd = raise (Return_DTD dtd) in + try + let doc = parse_document_entity + ~transform_dtd:transform_dtd + cfg + src + default_spec in + (* Should not happen: *) + doc # dtd + with + Return_DTD dtd -> + (* The normal case: *) + dtd +;; + + +let default_config = + let w = new drop_warnings in + { warner = w; + errors_with_line_numbers = true; + enable_pinstr_nodes = false; + enable_super_root_node = false; + enable_comment_nodes = false; + encoding = `Enc_iso88591; + recognize_standalone_declaration = true; + store_element_positions = true; + idref_pass = false; + validate_by_dfa = true; + accept_only_deterministic_models = true; + debugging_mode = false; + } + + +class [ 'ext ] hash_index = +object + constraint 'ext = 'ext node #extension + val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t) + method add s n = + try + ignore(Hashtbl.find ht s); + raise ID_not_unique + with + Not_found -> + Hashtbl.add ht s n + + method find s = Hashtbl.find ht s + method index = ht +end + + +(* ====================================================================== + * History: + * + * $Log$ + * Revision 1.1 2000/11/17 09:57:29 lpadovan + * Initial revision + * + * Revision 1.14 2000/08/26 23:23:14 gerd + * Bug: from_file must not interpret the file name as URL path. + * Bug: When PI and comment nodes are generated, the collected data + * material must be saved first. + * + * Revision 1.13 2000/08/19 21:30:03 gerd + * Improved the error messages of the parser + * + * Revision 1.12 2000/08/18 20:16:25 gerd + * Implemented that Super root nodes, pinstr nodes and comment + * nodes are included into the document tree. + * + * Revision 1.11 2000/08/14 22:24:55 gerd + * Moved the module Pxp_encoding to the netstring package under + * the new name Netconversion. + * + * Revision 1.10 2000/07/23 02:16:33 gerd + * Support for DFAs. + * + * Revision 1.9 2000/07/14 13:57:29 gerd + * Added the id_index feature. + * + * Revision 1.8 2000/07/09 17:52:45 gerd + * New implementation for current_data. + * The position of elements is stored on demand. + * + * Revision 1.7 2000/07/09 01:00:35 gerd + * Improvement: It is now guaranteed that only one data node + * is added for consecutive character material. + * + * Revision 1.6 2000/07/08 16:27:29 gerd + * Cleaned up the functions calling the parser. + * New parser argument: transform_dtd. + * Implementations for 'extract_dtd_from_document_entity' and + * 'parse_wfcontent_entity'. + * + * Revision 1.5 2000/07/06 23:05:18 gerd + * Initializations of resolvers were missing. + * + * Revision 1.4 2000/07/06 22:11:01 gerd + * Fix: The creation of the non-virtual root element is protected + * in the same way as the virtual root element. + * + * Revision 1.3 2000/07/04 22:15:18 gerd + * Change: Using the new resolver capabilities. + * Still incomplete: the new extraction and parsing functions. + * + * Revision 1.2 2000/06/14 22:19:06 gerd + * Added checks such that it is impossible to mix encodings. + * + * Revision 1.1 2000/05/29 23:48:38 gerd + * Changed module names: + * Markup_aux into Pxp_aux + * Markup_codewriter into Pxp_codewriter + * Markup_document into Pxp_document + * Markup_dtd into Pxp_dtd + * Markup_entity into Pxp_entity + * Markup_lexer_types into Pxp_lexer_types + * Markup_reader into Pxp_reader + * Markup_types into Pxp_types + * Markup_yacc into Pxp_yacc + * See directory "compatibility" for (almost) compatible wrappers emulating + * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. + * + * ====================================================================== + * Old logs from markup_yacc.m2y: + * + * Revision 1.9 2000/05/29 21:14:57 gerd + * Changed the type 'encoding' into a polymorphic variant. + * + * Revision 1.8 2000/05/27 19:26:19 gerd + * Change: The XML declaration is interpreted right after + * it has been parsed (no longer after the document): new function + * check_and_parse_xmldecl. + * When elements, attributes, and entities are declared + * it is stored whether the declaration happens in an external + * entity (for the standalone check). + * The option recognize_standalone_declaration is interpreted. + * + * Revision 1.7 2000/05/20 20:31:40 gerd + * Big change: Added support for various encodings of the + * internal representation. + * + * Revision 1.6 2000/05/14 21:51:24 gerd + * Change: Whitespace is handled by the grammar, and no longer + * by the entity. + * + * Revision 1.5 2000/05/14 17:50:54 gerd + * Updates because of changes in the token type. + * + * Revision 1.4 2000/05/11 22:09:17 gerd + * Fixed the remaining problems with conditional sections. + * This seems to be also a weakness of the XML spec! + * + * Revision 1.3 2000/05/09 00:02:44 gerd + * Conditional sections are now recognized by the parser. + * There seem some open questions; see the TODO comments! + * + * Revision 1.2 2000/05/08 22:01:44 gerd + * Introduced entity managers (see markup_entity.ml). + * The XML declaration is now recognized by the parser. If such + * a declaration is found, the method process_xmldecl of the currently + * active entity is called. If the first token is not an XML declaration, + * the method process_missing_xmldecl is called instead. + * Some minor changes. + * + * Revision 1.1 2000/05/06 23:21:49 gerd + * Initial revision. + * + * + * ====================================================================== + * + * COPIED FROM REVISION 1.19 OF markup_yacc.mly + * + * Revision 1.19 2000/05/01 15:20:08 gerd + * "End tag matches start tag" is checked before "End tag in the + * same entity as start tag". + * + * Revision 1.18 2000/04/30 18:23:08 gerd + * Bigger change: Introduced the concept of virtual roots. First, + * this reduces the number of checks. Second, it makes it possible to + * return the virtual root to the caller instead of the real root (new + * config options 'virtual_root' and 'processing_instructions_inline'). + * Minor changes because of better CR/CRLF handling. + * + * Revision 1.17 2000/03/13 23:47:46 gerd + * Updated because of interface changes. (See markup_yacc_shadow.mli + * rev. 1.8) + * + * Revision 1.16 2000/01/20 20:54:43 gerd + * New config.errors_with_line_numbers. + * + * Revision 1.15 1999/12/17 22:27:58 gerd + * Bugfix: The value of 'p_internal_subset' (an instance + * variable of the parser object) is to true when the internal subset + * begins, and is set to false when this subset ends. The error was + * that references to external entities within this subset did not + * set 'p_internal_subset' to false; this is now corrected by introducing + * the 'p_internal_subset_stack'. + * This is a typical example of how the code gets more and + * more complicated and that it is very difficult to really understand + * what is going on. + * + * Revision 1.14 1999/11/09 22:23:37 gerd + * Removed the invocation of "init_dtd" of the root document. + * This method is no longer available. The DTD is also passed to the + * document object by the root element, so nothing essential changes. + * + * Revision 1.13 1999/10/25 23:37:09 gerd + * Bugfix: The warning "More than one ATTLIST declaration for element + * type ..." is only generated if an ATTLIST is found while there are already + * attributes for the element. + * + * Revision 1.12 1999/09/01 23:08:38 gerd + * New frontend function: parse_wf_document. This simply uses + * a DTD that allows anything, and by the new parameter "extend_dtd" it is + * avoided that element, attlist, and notation declarations are added to this + * DTD. The idea is that this function simulates a well-formedness parser. + * Tag_beg, Tag_end carry the entity_id. The "elstack" stores the + * entity_id of the stacked tag. This was necessary because otherwise there + * are some examples to produces incorrectly nested elements. + * p_internal_subset is a variable that stores whether the internal + * subset is being parsed. This is important beacause entity declarations in + * internal subsets are not allowed to contain parameter references. + * It is checked if the "elstack" is empty after all has been parsed. + * Processing instructions outside DTDs and outside elements are now + * added to the document. + * The rules of mixed and regexp style content models have been + * separated. The code is now much simpler. + * Entity references outside elements are detected and rejected. + * + * Revision 1.11 1999/09/01 16:26:08 gerd + * Improved the quality of error messages. + * + * Revision 1.10 1999/08/31 19:13:31 gerd + * Added checks on proper PE nesting. The idea is that tokens such + * as Decl_element and Decl_rangle carry an entity ID with them. This ID + * is simply an object of type < >, i.e. you can only test on identity. + * The lexer always produces tokens with a dummy ID because it does not + * know which entity is the current one. The entity layer replaces the dummy + * ID with the actual ID. The parser checks that the IDs of pairs such as + * Decl_element and Decl_rangle are the same; otherwise a Validation_error + * is produced. + * + * Revision 1.9 1999/08/15 20:42:01 gerd + * Corrected a misleading message. + * + * Revision 1.8 1999/08/15 20:37:34 gerd + * Improved error messages. + * Bugfix: While parsing document entities, the subclass document_entity is + * now used instead of external_entity. The rules in document entities are a bit + * stronger. + * + * Revision 1.7 1999/08/15 14:03:59 gerd + * Empty documents are not allowed. + * "CDATA section not allowed here" is a WF_error, not a Validation_ + * error. + * + * Revision 1.6 1999/08/15 02:24:19 gerd + * Removed some grammar rules that were used for testing. + * Documents without DTD can now have arbitrary elements (formerly + * they were not allowed to have any element). + * + * Revision 1.5 1999/08/14 22:57:20 gerd + * It is allowed that external entities are empty because the + * empty string is well-parsed for both declarations and contents. Empty + * entities can be referenced anywhere because the references are replaced + * by nothing. Because of this, the Begin_entity...End_entity brace is only + * inserted if the entity is non-empty. (Otherwise references to empty + * entities would not be allowed anywhere.) + * As a consequence, the grammar has been changed such that a + * single Eof is equivalent to Begin_entity,End_entity without content. + * + * Revision 1.4 1999/08/14 22:20:01 gerd + * The "config" slot has now a component "warner" which is + * an object with a "warn" method. This is used to warn about characters + * that cannot be represented in the Latin 1 alphabet. + * Furthermore, there is a new component "debugging_mode". + * Some Parse_error exceptions have been changed into Validation_error. + * The interfaces of functions/classes imported from other modules + * have changed; the invocations have been adapted. + * Contents may contain CDATA sections that have been forgotten. + * + * Revision 1.3 1999/08/11 15:00:41 gerd + * The Begin_entity ... End_entity brace is also possible in + * 'contents'. + * The configuration passed to the parsing object contains always + * the resolver that is actually used. + * + * Revision 1.2 1999/08/10 21:35:12 gerd + * The XML/encoding declaration at the beginning of entities is + * evaluated. In particular, entities have now a method "xml_declaration" + * which returns the name/value pairs of such a declaration. The "encoding" + * setting is interpreted by the entity itself; "version", and "standalone" + * are interpreted by Markup_yacc.parse_document_entity. Other settings + * are ignored (this does not conform to the standard; the standard prescribes + * that "version" MUST be given in the declaration of document; "standalone" + * and "encoding" CAN be declared; no other settings are allowed). + * TODO: The user should be warned if the standard is not exactly + * fulfilled. -- The "standalone" property is not checked yet. + * + * Revision 1.1 1999/08/10 00:35:52 gerd + * Initial revision. + * + * + *)