X-Git-Url: http://matita.cs.unibo.it/gitweb/?p=helm.git;a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_yacc.m2y;fp=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_yacc.m2y;h=0000000000000000000000000000000000000000;hp=91de7cd2f06ff6ec81ee36ca1c85b696eaebf37d;hb=869549224eef6278a48c16ae27dd786376082b38;hpb=89262281b6e83bd2321150f81f1a0583645eb0c8 diff --git a/helm/DEVEL/pxp/pxp/pxp_yacc.m2y b/helm/DEVEL/pxp/pxp/pxp_yacc.m2y deleted file mode 100644 index 91de7cd2f..000000000 --- a/helm/DEVEL/pxp/pxp/pxp_yacc.m2y +++ /dev/null @@ -1,2528 +0,0 @@ -(* $Id$ -*- tuareg -*- - * ---------------------------------------------------------------------- - * PXP: The polymorphic XML parser for Objective Caml. - * Copyright by Gerd Stolpmann. See LICENSE for details. - *) - -open Parsing -open Pxp_types -open Pxp_lexer_types -open Pxp_dtd -open Pxp_entity -open Pxp_document -open Pxp_aux - -(* Some types from the interface definition: *) - -exception ID_not_unique - -class type [ 'ext ] index = -object - constraint 'ext = 'ext node #extension - method add : string -> 'ext node -> unit - method find : string -> 'ext node -end - - -type config = - { warner : collect_warnings; - errors_with_line_numbers : bool; - enable_pinstr_nodes : bool; - enable_super_root_node : bool; - enable_comment_nodes : bool; - encoding : rep_encoding; - recognize_standalone_declaration : bool; - store_element_positions : bool; - idref_pass : bool; - validate_by_dfa : bool; - accept_only_deterministic_models : bool; - debugging_mode : bool; - } - -type source = - Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver) - | ExtID of (ext_id * Pxp_reader.resolver) - - -type start_symbol = - Ext_document - | Ext_declarations - | Ext_element - - -type context = - { mutable current : unit -> token; (* get the current token *) - mutable get_next : unit -> token; (* go on to the next token; return it *) - mutable current_token : token; (* This is the current token *) - mutable manager : entity_manager; (* The entity manager *) - } - - -let make_context entity_manager = - let c = - { current = (fun _ -> assert false); - get_next = (fun _ -> assert false); - current_token = Eof; - manager = entity_manager; - } - in - (* Note that the function which is stored in get_next_ref can be changed - * as a side-effect when an entity is opened or closed. The function in - * c.get_next must be programmed such that always the current "get_next" - * function is executed. - *) - let get_next_ref = entity_manager # yy_get_next_ref in - c.current <- (fun () -> c.current_token); - c.get_next <- (fun () -> let tok = !get_next_ref() in - c.current_token <- tok; - tok); - ignore(c.get_next()); - c -;; - - -let from_channel ?system_encoding ?id:init_id ?fixenc ch = - - (* Reading from a channel works by modifying the algorithm of - * resolve_as_file. - *) - - let url_syntax = (* A syntax suitable for "file" URLs *) - { Neturl.null_url_syntax with - Neturl.url_enable_scheme = Neturl.Url_part_allowed; - Neturl.url_enable_host = Neturl.Url_part_allowed; - Neturl.url_enable_path = Neturl.Url_part_required; - Neturl.url_accepts_8bits = true; - } - in - - let an_url = - Neturl.make_url - ~scheme: "file" - ~host: "" - ~path: [ "" ] - url_syntax - in - - let init_channel_done = ref false in - (* Whether the first access to this source has already happened. *) - - (* The task of url_of_id is: - * - When it is called the first time, and no init_id is present, - * the URL file:/// is passed back (an_url). This forces that - * absolute path names /path/dir/... will be interpreted as - * file path names. (But relative path names will not work.) - * - If an init_id has been passed, we can assume that the opened URL - * is exactly this init_id. By raising Not_competent it is indicated - * that the standard method is to be used for the interpretation of - * the URL. - * - Otherwise, the channel is already being read, and thus cannot again - * opened. (This case is handled in channel_of_url.) - *) - - let url_of_id xid = - if !init_channel_done then begin - (* Use the normal way of determining the URL of the ID: *) - raise Pxp_reader.Not_competent - end - else begin - match init_id with - None -> - an_url - (* If the channel is not associated with any URL: Simply pass - * the URL file:/// back. - *) - | Some the_init_id -> - assert (the_init_id = xid); - raise Pxp_reader.Not_competent - (* If the channel is associated with a URL, the corresponding - * ID must be passed when the first invocation happens. - *) - end - in - - (* The task of channel_of_url: - * - If it is called the first time ("else"), the channel is returned - * - Otherwise, the channel is already being read, and thus cannot again - * opened. By raising Not_competent it is signaled that the - * resolve_as_file object must not continue to open the URL. - *) - - let channel_of_url url = - if !init_channel_done then - raise Pxp_reader.Not_competent - else begin - init_channel_done := true; - ch, fixenc - end - in - - let r = - new Pxp_reader.resolve_as_file - ?system_encoding:system_encoding - ~url_of_id:url_of_id - ~channel_of_url:channel_of_url - () - in - - let init_xid = - match init_id with - None -> Anonymous - | Some id -> - (* Note: 'id' may be illegal (malformed); in this case, the first - * invocation of url_of_id will raise Not_competent, and the 'open_in' - * method will fail. - *) - id - in - - ExtID(init_xid, r) -;; - - -let from_file ?system_encoding utf8_filename = - - let r = - new Pxp_reader.resolve_as_file - ?system_encoding:system_encoding - () - in - - let utf8_abs_filename = - if utf8_filename <> "" && utf8_filename.[0] = '/' then - utf8_filename - else - Sys.getcwd() ^ "/" ^ utf8_filename - in - - let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in - let url = Neturl.make_url - ~scheme:"file" - ~host:"localhost" - ~path:(Neturl.split_path utf8_abs_filename) - syntax - in - - let xid = System (Neturl.string_of_url url) in - - - ExtID(xid, r) -;; - - -let from_string ?fixenc s = - let r = - new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in - ExtID(Anonymous, r) -;; - - -(**********************************************************************) - -class ['ext] parser_object - init_doc init_dtd init_extend_dtd init_config init_resolver init_spec - init_process_xmldecl transform_dtd id_index - = - object (self) - - (* Note that the 'ext parameter has been the motivation to make the - * parser a class. - *) - - val mutable dtd = init_dtd - (* The DTD being parsed; or the DTD currently assumed *) - - val extend_dtd = init_extend_dtd - (* Whether the DTD should be extended by ELEMENT, ATTLIST, and - * NOTATION declarations or not. (True for validating mode, - * false for well-formedness mode.) - *) - - val transform_dtd = transform_dtd - (* A function transforming the DTD *) - - val id_index = (id_index : 'ext index option) - (* The ID index or None *) - - val process_xmldecl = init_process_xmldecl - (* Whether the XML declaration is parsed and the found XML version - * and standalone declaration are passed to 'doc'. - *) - - val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding) - - val doc = init_doc - (* The current document *) - - method doc = (doc : 'ext document) - - val resolver = init_resolver - (* The resolver for external IDs *) - - val config = init_config - (* The current configuration *) - - val elstack = (Stack.create() : ('ext node * entity_id) Stack.t) - (* The element stack containing all open elements, i.e. elements that - * have begun by a start tag but that have not been finished (end tag). - * If the parser sees a start tag, it creates the element and pushes it - * on top of this stack. If the parser recognizes an end tag, it pulls - * one element from the stack and checks if it has the same name as - * given with the end tag. - * - * At initialization time, a special element is pushed on the stack, - * the so-called super root. It is always the bottommost - * element of the stack, and serves as a guard. - * [See "initializer" below.] - *) - - method current = - (* Get the top element of the element stack *) - try - fst(Stack.top elstack) - with - Stack.Empty -> assert false - (* Not possible, because the super root is always the element - * at the bottom of the stack. - *) - - val mutable n_tags_open = 0 - (* Number of begin tags that have been parsed and whose corresponding - * end tags have not yet been parsed - *) - - val mutable p_internal_subset = false - (* true while parsing the internal subset - there are some additional - * constraints for internal subsets, and because of this it must - * be known whether the current declaration is contained in the - * internal or external subset of the DTD. - *) - - val mutable root = None - (* Contains the root element (topmost element) while it is being parsed - * and after it has been parsed. - * This variable is None before the root element is seen. - *) - - method root = root - - val spec = init_spec - (* A hashtable that contains exemplar objects for the various element - * types. If an element is parsed, the exemplar is looked up and - * "cloned" (by the "create" method) - *) - - val mutable current_data = [] - (* Collects character data. *) - - method collect_data s = - (* Collects the character material 's' *) - current_data <- s :: current_data - - method save_data = - (* Puts the material collected in 'current_data' into a new - * node, and appends this node as new sub node to 'current' - *) - match current_data with - [] -> - () - | [ str ] -> - if str <> "" then - self # current # add_node (create_data_node spec dtd str); - current_data <- [] - | _ -> - let count = List.fold_left - (fun acc s -> acc + String.length s) - 0 - current_data in - let str = String.create count in - let pos = ref count in - List.iter - (fun s -> - let l = String.length s in - pos := !pos - l; - String.blit - ~src:s - ~src_pos:0 - ~dst:str - ~dst_pos:(!pos) - ~len:l - ) - current_data; - assert(!pos = 0); - if str <> "" then - self # current # add_node (create_data_node spec dtd str); - current_data <- [] - - - method only_whitespace data = - (* Checks that the string "data" contains only whitespace. On failure, - * Validation_error is raised. - *) - let lexbuf = Lexing.from_string data in - let t1 = lexerset.scan_name_string lexbuf in - if t1 <> Ignore then - raise(WF_error("Data not allowed here")); - let t2 = lexerset.scan_name_string lexbuf in - if t2 <> Eof then - raise(WF_error("Data not allowed here")); - () - - initializer - (* CHECKS: *) - if config.encoding <> dtd # encoding then - failwith("Encoding mismatch"); - - (* --- Initialize 'elstack': Push the super-root on the stack. *) - let super_root = - if config.enable_super_root_node then - create_super_root_node spec dtd - else - (* because spec may not contain an exemplar for the super root: *) - create_no_node spec dtd - in - (* Move the super root or the emulation to the stack: *) - Stack.push (super_root, (self :> entity_id)) elstack; - - - - (********* Here the method "parse" begins. The grammar below is - * transformed to a local function of this method - *) - - method parse context start_symbol = - - let parse_ignored_section yy_current yy_get_next = - (* A special parser which should be used after is found. - *) - - while yy_current() = Ignore do - ignore(yy_get_next()); - done; - - ( match yy_current() with - Conditional_body _ -> () - | _ -> raise Parsing.Parse_error; - ); - - let en = context.manager # current_entity in - let llev = ref 1 in - while !llev >= 1 do - let igntok = en # next_ignored_token in - (* next_ignored_token: uses a special lexer that only - * recognizes Conditional_begin and Conditional_end; - * other character combinations are ignored. - *) - (* NOTE: next_ignored_token works much like yy_get_next, - * but it does not set the current token! - *) - match igntok with - Conditional_begin _ -> - llev := !llev + 1 - | Conditional_end _ -> - llev := !llev - 1; - (* Because the loop may be exited now: *) - context.current_token <- igntok; - | (End_entity | Eof) -> - raise Parsing.Parse_error - | _ -> - () - done; - - in - - - let check_and_parse_xmldecl xmldecl = - if process_xmldecl then begin - let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in - check_version_num v; - doc # init_xml_version v; - let v = match s with - None -> false - | Some "yes" -> true - | Some "no" -> false - | _ -> raise (WF_error("Illegal 'standalone' declaration")) - in - if config.recognize_standalone_declaration then - dtd # set_standalone_declaration v - end - in - - let recode_utf8 s = - (* Recode 's' to UTF-8 *) - if config.encoding = `Enc_utf8 then - s (* No recoding necessary *) - else - Netconversion.recode_string - ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s - in - - -%% - -/* The following grammar looks similar to ocamlyacc grammars, but - * ocamlyacc is actually not used to transform the grammar into a parser. - * Instead, the parser generator m2parsergen is applied. - * - * The format of the grammar is different (see m2parsergen/README), - * but I hope that you can understand most features immediately. - * - * The type of the parser is different: m2parsergen creates a top-down - * parser while ocamlyacc generates a LALR-1 parser. - * - * The way the generated code is called is different: ocamlyacc produces - * lots of top-level definitions whereas m2parsergen generates only - * a local let-in-phrase. This is explained in the already mentioned - * README file. - */ - -/* See Pxp_types.ml for comments to the various tokens */ - -%token Begin_entity -%token End_entity -%token Comment_begin -%token Comment_end -%token Ignore -%token Eq -%token Rangle -%token Rangle_empty -%token <> Conditional_begin -%token <> Conditional_body -%token <> Conditional_end -%token Percent -%token Plus -%token Star -%token Bar -%token Comma -%token Qmark -%token Pcdata -%token Required -%token Implied -%token Fixed -%token Eof - -%token <> Comment_material -%token <> Doctype -%token <> Doctype_rangle -%token <> Dtd_begin -%token <> Dtd_end -%token <> Decl_element -%token <> Decl_attlist -%token <> Decl_entity -%token <> Decl_notation -%token <> Decl_rangle -%token <> Lparen -%token <> Rparen -%token <> RparenPlus -%token <> RparenStar -%token <> RparenQmark - -%token <> Tag_beg -%token <> Tag_end - -%token <> PI -%token <> PI_xml -%token <> Cdata -%token <> CRef -%token <> ERef -%token <> PERef -%token <> CharData -%token <> LineEnd -%token <> Name -%token <> Nametoken -%token <> Attval -%token <> Attval_nl_normalized -%token <> Unparsed_string - -/* START SYMBOLS: - * - * "ext_document": parses a complete XML document (i.e. containing a - * and an element) - * "ext_declarations": parses an "external DTD subset", i.e. a sequence - * of declarations - * "ext_element": parses a single element (no allowed); - * the element needs not to be the root element of the - * DTD - * - * The functions corresponding to these symbols return always () because - * they only have side-effects. - */ - -/* SOME GENERAL COMMENTS: - * - * The parser does not get its tokens from the lexers directly. Instead of - * this, there is an entity object between the parser and the lexers. This - * object already handles: - * - * - References to general and parameter entities. The token stream is - * modified such that tokens automatically come from the referenced entities. - * External parameter entities and all general entities are embraced by - * the two special tokens Begin_entity and End_entity. The parser must - * check that these braces are correctly nested. - */ - -%% - - -ext_document(): - Begin_entity - doc_xmldecl_then_misc_then_prolog_then_rest() End_entity - {{ - if n_tags_open <> 0 then - raise(WF_error("Missing end tag")) - }} - - -/* In the following rule, we must find out whether there is an XML declaration - * or not, and directly after that either "process_xmldecl" or - * "process_missing_xmldecl" of the current entity must be called. - * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out - * in the "$" clause immediately following the first token. - * - * TODO: This is not enough. The first token may be a tag, and the tag - * may already contain non-ASCII characters. (But in this case, the resolvers - * assume UTF8, and they are right...) - */ - -doc_xmldecl_then_misc_then_prolog_then_rest(): - pl:PI_xml - $ {{ context.manager # current_entity # process_xmldecl pl; - check_and_parse_xmldecl pl; - }} - misc()* doc_prolog_then_rest() - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - misc() misc()* doc_prolog_then_rest() - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - doctypedecl() misc()* contents_start() - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - contents_start() - {{ () }} - - -doc_prolog_then_rest(): - doctypedecl() misc()* contents_start() - {{ () }} -| contents_start() - {{ () }} - - -ext_element(): - Begin_entity el_xmldecl_then_misc_then_rest() End_entity - {{ - if n_tags_open <> 0 then - raise(WF_error("Missing end tag")) - }} - - -/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ - -el_xmldecl_then_misc_then_rest(): - pl:PI_xml - $ {{ context.manager # current_entity # process_xmldecl pl; }} - misc()* contents_start() - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - misc() misc()* contents_start() - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - contents_start() - {{ () }} - - -ext_declarations(): - /* Parses a sequence of declarations given by an entity. As side-effect, - * the parsed declarations are put into the dtd object. - */ - Begin_entity decl_xmldecl_then_rest() - {{ () }} -| Eof - {{ () }} - - -decl_xmldecl_then_rest(): - /* Note: This rule is also called from declaration()! */ - pl:PI_xml - $ {{ context.manager # current_entity # process_xmldecl pl; - }} - declaration()* End_entity - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - declaration() declaration()* End_entity - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - End_entity - {{ () }} - - -misc(): - pi() - {{ () }} -| data: CharData - /* In this context, the lexers sometimes do not recognize white space; - * instead CharData tokens containing white space are delivered. - */ - {{ self # only_whitespace data }} -| Ignore - {{ () }} -| comment() - {{ () }} - - -/********************* DOCUMENT TYPE DECLARATION *************************/ - -doctypedecl(): - /* parses from . As side-effect, first the declarations of - * the internal DTD (if any) are put into !!on_dtd, then the declarations - * of the external DTD (if any) are put into this DTD object. - */ - doctype_entid: Doctype - ws: Ignore Ignore* - doctypedecl_material (doctype_entid) - {{ () }} - ? {{ match !yy_position with - "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'")) - | _ -> raise(WF_error("Bad DOCTYPE declaration")) - }} - - -/* TRICK: - * ws: Ignore? Ignore* - * is meant seriously. The effect is that ws becomes a boolean variable - * which is true if there is an Ignore token and false otherwise. - * This construct is faster than just - * ws: Ignore* - * in which case ws becomes an integer variable containing the number of - * Ignore tokens. Counting the number of tokens is slower than only checking - * the existence. - * - * We need the information whether there is an Ignore token (representing - * white space), because white space is only obligatory if also an identifier - * for the external subset is parsed; this conditional syntax constraint is - * simply programmed in the body of the grammar rule. - */ - -doctypedecl_material(doctype_entid): - root_name: Name - ws: Ignore? Ignore* - external_subset: external_id()? - Ignore* - internal_subset: internal_dtd()? - Ignore* - doctype_rangle_entid: Doctype_rangle - {{ - if doctype_entid != doctype_rangle_entid then - raise (Validation_error("Entities not properly nested with DOCTYPE declaration")); - dtd # set_root root_name; - begin match external_subset, internal_subset with - None, None -> () (* no DTD means no ID *) - | None, Some _ -> dtd # set_id Internal - | Some id, None -> dtd # set_id (External id) - | Some id, Some _ -> dtd # set_id (Derived id) - end; - (* Get now the external doctype declaration. Note that the internal - * subset has precedence and must be read first. - *) - begin match external_subset with - None -> () - | Some id -> - if not ws then - raise(WF_error("Whitespace is missing after `DOCTYPE " ^ - root_name ^ "'")); - let r' = resolver # clone in - let pobj = - new parser_object - (new document config.warner) - dtd - extend_dtd - config - r' - spec - process_xmldecl - (fun x -> x) - None - in - let en = new external_entity r' dtd "[dtd]" - config.warner id false config.errors_with_line_numbers - config.encoding - in - en # set_debugging_mode (config.debugging_mode); - let mgr = new entity_manager en in - en # open_entity true Declaration; - try - let context = make_context mgr in - pobj # parse context Ext_declarations; - ignore(en # close_entity); - with - error -> - ignore(en # close_entity); - r' # close_all; - let pos = mgr # position_string in - raise (At(pos, error)) - end; - dtd # validate - }} - ? {{ - match !yy_position with - "doctype_rangle_entid" -> raise(WF_error("`>' expected")) - | _ -> raise(WF_error("Bad DOCTYPE declaration")) - }} - -/* Note that there are no keywords for SYSTEM or PUBLIC, as these would - * be difficult to recognize in the lexical contexts. Because of this, - * SYSTEM/PUBLIC is parsed as name, and the rule for everything after - * SYSTEM/PUBLIC is computed dynamically. - */ - -external_id(): - tok:Name - $ {{ - let followup = - match tok with - "SYSTEM" -> parse_system_id - (* Apply the rule system_id (below) to parse the - * rest of the ID - *) - | "PUBLIC" -> parse_public_id - (* Apply the rule public_id (below) to parse the - * rest of the ID - *) - | _ -> raise(WF_error("SYSTEM or PUBLIC expected")) - in - }} - ws:Ignore Ignore* - r:[followup]() - {{ r }} - ? {{ match !yy_position with - "ws" -> raise(WF_error("Whitespace is missing after " ^ tok)) - | _ -> raise(WF_error("Bad SYSTEM or PUBLIC identifier")) - }} - - -system_id(): - str:Unparsed_string - {{ System (recode_utf8 str) }} - - -public_id(): - str1: Unparsed_string - ws: Ignore Ignore* - str2: Unparsed_string - {{ check_public_id str1; - Public(recode_utf8 str1, recode_utf8 str2) - }} - ? {{ match !yy_position with - "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier")) - | _ -> raise(WF_error("Bad PUBLIC identifier")) - }} - - -/* The internal subset: "[" declaration* "]". While parsing the declarations - * the object variable p_internal_subset must be true; however, if there - * are entity references, this variable must be reset to false during - * the entity. (See the rule for "declaration" below.) - */ - -internal_dtd(): - dtd_begin_entid: internal_dtd_begin() - declaration()* - dtd_end_entid: internal_dtd_end() - {{ - if dtd_begin_entid != dtd_end_entid then - raise(Validation_error("Entities not properly nested with internal DTD subset")) - }} - ? {{ match !yy_position with - "dtd_end_entid" -> raise(WF_error("`]' expected")) - | _ -> raise(WF_error("Bad internal DTD subset")) - }} - - -internal_dtd_begin(): - Dtd_begin - {{ assert (not p_internal_subset); - p_internal_subset <- true }} - - -internal_dtd_end(): - Dtd_end - {{ assert p_internal_subset; - p_internal_subset <- false }} - - -declaration(): - /* Parses a single declaration (or processing instruction). As side-effect - * the parsed declaration is stored into the dtd object. - */ - elementdecl() - {{ () }} -| attlistdecl() - {{ () }} -| entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid) - {{ () }} - ? {{ match !yy_position with - "ws" -> raise(WF_error("Whitespace is missing after ENTITY")) - | "e" -> raise(WF_error("Name or `%' expected")) - | _ -> raise(WF_error("Bad entity declaration")) - }} -| notationdecl() - {{ () }} -| pi: PI - {{ let target, value = pi in - let pi = new proc_instruction target value config.encoding in - dtd # add_pinstr pi - }} -| Ignore - {{ () }} -| Comment_begin Comment_material* ce:Comment_end - {{ () }} - ? {{ match !yy_position with - "ce" -> raise(WF_error("`-->' expected")) - | _ -> raise(WF_error("Bad comment")) - }} -| Begin_entity - $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity' - * rule is parsed. This allows unrestricted usage of parameter entities - * within declarations of internal entities. - *) - let old_p_internal_subset = p_internal_subset in - p_internal_subset <- false; - }} - decl_xmldecl_then_rest() - {{ (* Restore the old value of 'p_internal_subset'. *) - p_internal_subset <- old_p_internal_subset; - () - }} -| begin_entid:Conditional_begin - $ {{ (* Check whether conditional sections are allowed at this position. *) - if p_internal_subset then - raise(WF_error("Restriction of the internal subset: Conditional sections not allowed")); - }} - Ignore* - cond:conditional_section() end_entid:Conditional_end - {{ (* Check whether Conditional_begin and Conditional_end are in the same - * entity. (This restriction is explained in the file SPECS.) - *) - if begin_entid != end_entid then - raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)")); - }} - ? {{ match !yy_position with - "end_entid" -> raise(WF_error("`>]>' expected")) - | "cond" -> raise(WF_error("INCLUDE or IGNORE expected")) - | _ -> raise(WF_error("Bad conditional section")) - }} - -/* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the - * right parsing rule is dynamic. - * Note that parse_ignored_section is not defined by a grammar rule but - * by a conventional let-binding above. - */ - -conditional_section(): - include_or_ignore:Name - $ {{ let parsing_function = - match include_or_ignore with - "INCLUDE" -> parse_included_section - (* invoke rule "included_section" below *) - | "IGNORE" -> parse_ignored_section - (* invoke function "parse_ignored_section" *) - | _ -> raise(WF_error("INCLUDE or IGNORE expected")) - in - }} - [ parsing_function ] () - {{ () }} - ? {{ raise(WF_error("Bad conditional section")) }} - -included_section(): - Conditional_body declaration()* - {{ () }} -| Ignore Ignore* Conditional_body declaration()* - {{ () }} - - -/*************************** ELEMENT DECLARATIONS ********************/ - -elementdecl(): - /* parses . Puts the parsed element type as side-effect into - * dtd. - */ - decl_element_entid: Decl_element - $ {{ let extdecl = context.manager # current_entity_counts_as_external in - }} - ws1: Ignore Ignore* - name: Name - ws2: Ignore Ignore* - content_model: contentspec() - Ignore* - decl_rangle_entid: Decl_rangle - {{ - if decl_element_entid != decl_rangle_entid then - raise (Validation_error "Entities not properly nested with ELEMENT declaration"); - if extend_dtd then begin - let el = new dtd_element dtd name in - (* It is allowed that an precedes the corresponding - * . Because of this it is possible that there is already - * an element called 'name' in the DTD, and we only must set the content - * model of this element. - *) - try - dtd # add_element el; - el # set_cm_and_extdecl content_model extdecl; - with - Not_found -> (* means: there is already an element 'name' *) - let el' = dtd # element name in - el' # set_cm_and_extdecl content_model extdecl; - (* raises Validation_error if el' already has a content model *) - end - }} - ? {{ match !yy_position with - ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) - | "name" -> raise(WF_error("The name of the element is expected here")) - | "content_model" -> raise(WF_error("Content model expression expected")) - | "decl_rangle_entid" -> raise(WF_error("`>' expected")) - | _ -> raise(WF_error("Bad element type declaration")) - }} - -contentspec(): - /* parses a content model and returns it (type content_model_type) */ - name: Name /* EMPTY or ANY */ - {{ match name with - "EMPTY" -> Empty - | "ANY" -> Any - | _ -> raise(WF_error("EMPTY, ANY, or a subexpression expected")) - }} -| entid:Lparen Ignore* term:mixed_or_regexp(entid) - {{ term }} - ? {{ raise(WF_error("Bad content model expression")) }} - - -/* Many of the following rules have an lparen_entid argument. This is the - * internal ID of the entity containing the corresponding left parenthesis; - * by comparing it with the ID of the entity of the right parenthesis the - * contraint is implemented that both parentheses must be in the same entity. - */ - -mixed_or_regexp(lparen_entid): - re: choice_or_seq(lparen_entid) - {{ Regexp re }} -| m: mixed(lparen_entid) - {{ m }} - - -multiplier(): - /* returns one of the multiplier symbols (?,*,+) */ - Plus - {{ Plus }} -| Star - {{ Star }} -| Qmark - {{ Qmark }} - - -mixed (lparen_entid) : - Pcdata - Ignore* - material: mixed_alternatives_top() - {{ - let rest, rparen_entid = material in - if lparen_entid != rparen_entid then - raise (Validation_error "Entities not properly nested with parentheses"); - Mixed (MPCDATA :: rest) - }} - ? {{ raise(WF_error("Bad content model expression")) }} - - -mixed_alternatives_top(): - entid: Rparen - {{ [], entid }} -| entid: RparenStar - {{ [], entid }} -| Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar - {{ - (MChild name :: names), entid - }} - ? {{ match !yy_position with - "name" -> raise(WF_error("Name expected")) - | "entid" -> raise(WF_error("`)*' expected")) - | _ -> raise(WF_error("Bad content model expression")) - }} - - -mixed_alternative() : - Bar Ignore* name:Name Ignore* - {{ MChild name }} - ? {{ match !yy_position with - "name" -> raise(WF_error("Name expected")) - | _ -> raise(WF_error("Bad content model expression")) - }} - - - -choice_or_seq (lparen_entid): - /* parses either a regular expression, or a mixed expression. Returns - * Mixed spec or Regexp spec (content_model_type). - * Which kind of expression (regexp or mixed) is being read is recognized - * after the first subexpression has been parsed; the other subexpressions - * must be of the same kind. - */ - re: cp() - Ignore* - factor: choice_or_seq_factor() - {{ - let (finalmark,subexpr), rparen_entid = factor in - if lparen_entid != rparen_entid then - raise (Validation_error "Entities not properly nested with parentheses"); - (* Check that the other subexpressions are "regexp", too, and - * merge them with the first. - *) - let re' = - match subexpr with - Alt [] -> re - | Alt alt -> Alt (re :: alt) - | Seq seq -> Seq (re :: seq) - | _ -> assert false - in - (* Interpret the finalmark. *) - match finalmark with - Ignore -> re' - | Plus -> Repeated1 re' - | Star -> Repeated re' - | Qmark -> Optional re' - | _ -> assert false - }} - ? {{ raise(WF_error("Bad content model expression")) }} - -choice_or_seq_factor(): - /* Parses "||...)" or ",,...)", both forms optionally - * followed by ?, *, or +. - * Returns ((finalmark, expr), rparen_entid), where - * - finalmark is the character after the right parenthesis or Ignore - * - expr is either - * Alt [] meaning that only ")" has been found - * Alt non_empty_list meaning that the subexpressions are separated by '|' - * Seq non_empty_list meaning that the subexpressions are separated by ',' - */ - entid:Rparen - {{ (Ignore, Alt []), entid }} -| entid:RparenPlus - {{ (Plus, Alt []), entid }} -| entid:RparenStar - {{ (Star, Alt []), entid }} -| entid:RparenQmark - {{ (Qmark, Alt []), entid }} -| Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor() - {{ - let (finalmark, subexpr), rparen_entid = factor in - begin match subexpr with - Alt [] -> (finalmark, (Alt [re])), rparen_entid - | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid - | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) - end - }} - ? {{ raise(WF_error("Bad content model expression")) }} -| Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor() - {{ - let (finalmark, subexpr), rparen_entid = factor in - begin match subexpr with - Alt [] -> (finalmark, (Seq [re])), rparen_entid - | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid - | _ -> raise(WF_error("It is not allowed to mix alternatives and sequences")) - end - }} - ? {{ raise(WF_error("Bad content model expression")) }} - -cp(): - /* parse either a name, or a parenthesized subexpression "(...)" */ - name:Name m:multiplier()? - {{ match m with - None -> Child name - | Some Plus -> Repeated1 (Child name) - | Some Star -> Repeated (Child name) - | Some Qmark -> Optional (Child name) - | _ -> assert false - }} - ? {{ raise(WF_error("Bad content model expression")) }} -| entid:Lparen Ignore* m:choice_or_seq(entid) - {{ m }} - ? {{ raise(WF_error("Bad content model expression")) }} - - -/********************* ATTRIBUTE LIST DECLARATION ***********************/ - -attlistdecl(): - /* parses . Enters the attribute list in dtd as side- - * effect. - */ - decl_attlist_entid: Decl_attlist - $ {{ let extdecl = context.manager # current_entity_counts_as_external in - }} - ws1: Ignore Ignore* - el_name: Name - ws: Ignore? Ignore* - factor: attdef_factor() - {{ - let at_list, decl_rangle_entid = factor in - - if decl_attlist_entid != decl_rangle_entid then - raise (Validation_error "Entities not properly nested with ATTLIST declaration"); - - if not ws && at_list <> [] then begin - match at_list with - (name,_,_) :: _ -> - (* This is normally impossible, because the lexer demands - * some other token between two names. - *) - raise(WF_error("Whitespace is missing before `" ^ name ^ "'")); - | _ -> assert false - end; - - if extend_dtd then begin - let new_el = new dtd_element dtd el_name in - (* Note that it is allowed that precedes the corresponding - * declaration. In this case we add the element declaration - * already to the DTD but leave the content model unspecified. - *) - let el = - try - dtd # add_element new_el; - new_el - with - Not_found -> (* already added *) - let old_el = dtd # element el_name in - if old_el # attribute_names <> [] then - config.warner # warn ("More than one ATTLIST declaration for element type `" ^ - el_name ^ "'"); - old_el - in - List.iter - (fun (a_name, a_type, a_default) -> - el # add_attribute a_name a_type a_default extdecl) - at_list - end - }} - ? {{ match !yy_position with - "ws1" -> raise(WF_error("Whitespace is missing after ATTLIST")) - | "el_name" -> raise(WF_error("The name of the element is expected here")) - | "factor" -> raise(WF_error("Another attribute name or `>' expected")) - | _ -> raise(WF_error("Bad attribute declaration")) - }} - - -attdef_factor(): - /* parses a list of triples and returns the - * list as (string * att_type * att_default) list. - */ - attdef:attdef() ws:Ignore? Ignore* factor:attdef_factor() - {{ - let attdef_rest, decl_rangle_entid = factor in - if not ws && attdef_rest <> [] then begin - match attdef_rest with - (name,_,_) :: _ -> - raise(WF_error("Missing whitespace before `" ^ name ^ "'")); - | _ -> assert false - end; - (attdef :: attdef_rest), decl_rangle_entid }} - ? {{ match !yy_position with - | "factor" -> raise(WF_error("Another attribute name or `>' expected")) - | _ -> raise(WF_error("Bad attribute declaration")) - }} -| entid:Decl_rangle - {{ [], entid }} - - -attdef(): - /* Parses a single triple */ - name: Name - ws1: Ignore Ignore* - tp: atttype() - ws2: Ignore Ignore* - default: defaultdecl() - {{ (name,tp,default) }} - ? {{ match !yy_position with - ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) - | "tp" -> raise(WF_error("Type of attribute or `(' expected")) - | "default" -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected")) - | _ -> raise(WF_error("Bad attribute declaration")) - }} - -atttype(): - /* Parses an attribute type and returns it as att_type. */ - name: Name - $ {{ let followup = - if name = "NOTATION" then - parse_notation - else - parse_never - in - }} - nota: [followup]()? - {{ - match name with - "CDATA" -> A_cdata - | "ID" -> A_id - | "IDREF" -> A_idref - | "IDREFS" -> A_idrefs - | "ENTITY" -> A_entity - | "ENTITIES" -> A_entities - | "NMTOKEN" -> A_nmtoken - | "NMTOKENS" -> A_nmtokens - | "NOTATION" -> - (match nota with - None -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)")) - | Some n -> n - ) - | _ -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected")) - }} - ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }} - -| Lparen - Ignore* - name: name_or_nametoken() - Ignore* - names: nmtoken_factor()* - rp: Rparen - /* Enumeration */ - {{ A_enum(name :: names) }} - ? {{ match !yy_position with - "name" -> raise(WF_error("Name expected")) - | "names" -> raise(WF_error("`|' and more names expected, or `)'")) - | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) - | _ -> raise(WF_error("Bad enumeration type")) - }} - - -never(): - /* The always failing rule */ - $ {{ raise Not_found; }} - Doctype /* questionable */ - {{ A_cdata (* Does not matter *) - }} - - -notation(): - Ignore Ignore* - lp: Lparen - Ignore* - name: Name - Ignore* - names: notation_factor()* - rp: Rparen - {{ A_notation(name :: names) }} - ? {{ match !yy_position with - "lp" -> raise(WF_error("`(' expected")) - | "name" -> raise(WF_error("Name expected")) - | "names" -> raise(WF_error("`|' and more names expected, or `)'")) - | "rp" -> raise(WF_error("`|' and more names expected, or `)'")) - | _ -> raise(WF_error("Bad NOTATION type")) - }} - - -notation_factor(): - /* Parse "|" and return the name */ - Bar Ignore* name:Name Ignore* - {{ name }} - ? {{ match !yy_position with - "name" -> raise(WF_error("Name expected")) - | _ -> raise(WF_error("Bad NOTATION type")) - }} - -nmtoken_factor(): - /* Parse "|" and return the nmtoken */ - Bar Ignore* n:name_or_nametoken() Ignore* - {{ n }} - ? {{ match !yy_position with - "n" -> raise(WF_error("Nametoken expected")) - | _ -> raise(WF_error("Bad enumeration type")) - }} - - -name_or_nametoken(): - n:Name {{ n }} -| n:Nametoken {{ n }} - - -/* The default values must be expanded and normalized. This has been implemented - * by the function expand_attvalue. - */ - - -defaultdecl(): - /* Parse the default value for an attribute and return it as att_default */ - Required - {{ D_required }} -| Implied - {{ D_implied }} -| Fixed ws:Ignore Ignore* str:Unparsed_string - {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }} - ? {{ match !yy_position with - "ws" -> raise(WF_error("Whitespace is missing after #FIXED")) - | "str" -> raise(WF_error("String literal expected")) - | _ -> raise(WF_error("Bad #FIXED default value")) - }} -| str:Unparsed_string - {{ D_default (expand_attvalue lexerset dtd str config.warner false) }} - - -/**************************** ENTITY DECLARATION ***********************/ - -entitydecl(decl_entity_entid): - /* parses everything _after_ . The parsed - * entity declaration is entered into the dtd object as side-effect. - */ - name: Name - $ {{ let extdecl = context.manager # current_entity_counts_as_external in - }} - ws: Ignore Ignore* - material: entitydef() - Ignore* - decl_rangle_entid: Decl_rangle - /* A general entity */ - {{ - if decl_entity_entid != decl_rangle_entid then - raise (Validation_error "Entities not properly nested with ENTITY declaration"); - let en = - (* Distinguish between - * - internal entities - * - external entities - * - NDATA (unparsed) entities - *) - match material with - (Some s, None, None) -> - new internal_entity dtd name config.warner s p_internal_subset - config.errors_with_line_numbers false config.encoding - | (None, Some xid, None) -> - new external_entity (resolver # clone) dtd name config.warner - xid false config.errors_with_line_numbers - config.encoding - - | (None, Some xid, Some n) -> - (new ndata_entity name xid n config.encoding :> entity) - | _ -> assert false - in - dtd # add_gen_entity en extdecl - }} - ? {{ match !yy_position with - "ws" -> raise(WF_error("Whitespace is missing")) - | "material" -> raise(WF_error("String literal or identifier expected")) - | "decl_rangle_entid" -> raise(WF_error("`>' expected")) - | _ -> raise(WF_error("Bad entity declaration")) - }} - -| Percent - $ {{ let extdecl = context.manager # current_entity_counts_as_external in - }} - ws1: Ignore Ignore* - name: Name - ws2: Ignore Ignore* - material: pedef() - Ignore* - decl_rangle_entid: Decl_rangle - /* A parameter entity */ - {{ - if decl_entity_entid != decl_rangle_entid then - raise (Validation_error "Entities not properly nested with ENTITY declaration"); - let en = - (* Distinguish between internal and external entities *) - match material with - (Some s, None) -> - new internal_entity dtd name config.warner s p_internal_subset - config.errors_with_line_numbers true config.encoding - | (None, Some xid) -> - new external_entity (resolver # clone) dtd name config.warner - xid true config.errors_with_line_numbers - config.encoding - | _ -> assert false - in - - (* The following two lines force that even internal entities count - * as external (for the standalone check) if the declaration of - * the internal entity occurs in an external entity. - *) - if extdecl then - en # set_counts_as_external; - - dtd # add_par_entity en; - }} - ? {{ match !yy_position with - ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing")) - | "material" -> raise(WF_error("String literal or identifier expected")) - | "decl_rangle_entid" -> raise(WF_error("`>' expected")) - | _ -> raise(WF_error("Bad entity declaration")) - }} - - -entitydef(): - /* parses the definition value of a general entity. Returns either: - * - (Some s, None, None) meaning the definition of an internal entity - * with (literal) value s has been found - * - (None, Some x, None) meaning that an external parsed entity with - * external ID x has been found - * - (None, Some x, Some n) meaning that an unparsed entity with - * external ID x and notations n has been found - */ - str:Unparsed_string - {{ Some str, None, None }} -| id:external_id() ws:Ignore? Ignore* decl:ndatadecl()? - {{ if not ws && decl <> None then - raise(WF_error("Whitespace missing before `NDATA'")); - None, Some id, decl - }} - - -pedef(): - /* parses the definition value of a parameter entity. Returns either: - * - (Some s, None) meaning that the definition of an internal entity - * with (literal) value s has been found - * - (None, Some x) meaning that an external ID x has been found - */ - str:Unparsed_string - {{ Some str, None }} -| id:external_id() - {{ None, Some id }} - - -ndatadecl(): - /* Parses either NDATA "string" or the empty string; returns Some "string" - * in the former, None in the latter case. - */ - ndata:Name ws:Ignore Ignore* name:Name - {{ if ndata = "NDATA" then - name - else - raise(WF_error("NDATA expected")) - }} - ? {{ match !yy_position with - "ws" -> raise(WF_error("Whitespace is missing after NDATA")) - | "name" -> raise(WF_error("Name expected")) - | _ -> raise(WF_error("Bad NDATA declaration")) - }} - -/**************************** NOTATION DECLARATION *******************/ - -notationdecl(): - /* parses and enters the notation declaration into the - * dtd object as side-effect - */ - decl_notation_entid: Decl_notation - ws1: Ignore Ignore* - name: Name - ws2: Ignore Ignore* - sys_or_public: Name /* SYSTEM or PUBLIC */ - ws3: Ignore Ignore* - str1: Unparsed_string - ws: Ignore? Ignore* - str2: Unparsed_string? - Ignore* - decl_rangle_entid: Decl_rangle - {{ - if decl_notation_entid != decl_rangle_entid then - raise (Validation_error "Entities not properly nested with NOTATION declaration"); - let xid = - (* Note that it is allowed that PUBLIC is only followed by one - * string literal - *) - match sys_or_public with - "SYSTEM" -> - if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument")); - System (recode_utf8 str1) - | "PUBLIC" -> - begin match str2 with - None -> - check_public_id str1; - Public(recode_utf8 str1,"") - | Some p -> - if not ws then - raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id")); - check_public_id str1; - Public(recode_utf8 str1, recode_utf8 p) - end - | _ -> raise(WF_error("PUBLIC or SYSTEM expected")) - in - if extend_dtd then begin - let no = new dtd_notation name xid config.encoding in - dtd # add_notation no - end - }} - ? {{ match !yy_position with - ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing")) - | "name" -> raise(WF_error("Name expected")) - | "sys_or_public" -> raise(WF_error("SYSTEM or PUBLIC expected")) - | ("str1"|"str2") -> raise(WF_error("String literal expected")) - | "decl_rangle_entid" -> raise(WF_error("`>' expected")) - | _ -> raise(WF_error("Bad NOTATION declaration")) - }} - -/****************************** ELEMENTS **************************/ - -/* In the following rules, the number of error rules is reduced to - * improve the performance of the parser. - */ - - -contents_start(): - /* parses ... misc*, i.e. exactly one element followed - * optionally by white space or processing instructions. - * The element is entered into the global variables as follows: - * - If elstack is non-empty, the parsed element is added as new child to - * the top element of the stack. - * - If elstack is empty, the root_examplar object is modified rather than - * that a new element is created. If additionally the variable root is - * None, it is assigned Some root_examplar. - * Note that the modification of the root_exemplar is done by the method - * internal_init. - * The reason why the root element is modified rather than newly created - * is a typing requirement. It must be possible that the class of the root - * is derived from the original class element_impl, i.e. the user must be - * able to add additional methods. If we created a new root object, we - * would have to denote to which class the new object belongs; the root - * would always be an 'element_impl' object (and not a derived object). - * If we instead cloned an exemplar object and modified it by the - * "create" method, the root object would belong to the same class as the - * exemplar (good), but the type of the parsing function would always - * state that an 'element_impl' was created (because we can pass the new - * object only back via a global variable). The only solution is to - * modify the object that has been passed to the parsing function directly. - */ - $ {{ dtd <- transform_dtd dtd; }} - start_tag() content()* - {{ () }} - - -content(): - /* parses: start tags, end tags, content, or processing - * instructions. That the tags are properly nested is dynamically checked. - * As result, recognized elements are added to their parent elements, - * content is added to the element containing it, and processing instructions - * are entered into the element embracing them. (All as side-effects.) - */ - start_tag() - {{ () }} -| end_tag() - {{ () }} -| char_data() - {{ () }} -| cref() - {{ () }} -| pi() - {{ () }} -| entity_ref() - {{ () }} -| comment() - {{ () }} - - -entity_ref(): - Begin_entity eref_xmldecl_then_rest() - {{ if n_tags_open = 0 then - raise(WF_error("Entity reference not allowed here")) - }} - - -/* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */ - -eref_xmldecl_then_rest(): - pl:PI_xml - $ {{ context.manager # current_entity # process_xmldecl pl; - }} - content()* End_entity - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - content() content()* End_entity - {{ () }} - -| $ {{ context.manager # current_entity # process_missing_xmldecl; }} - End_entity - {{ () }} - - -start_tag(): - /* parses or . - * - * EFFECT: If elstack is non-empty, the element is added to the - * top element of the stack as new child, and the element - * is pushed on the stack. If elstack is empty, the root_exemplar is - * modified and gets the parsed name and attribute list. The root_exemplar - * is pushed on the stack. If additionally the variable root is empty, too, - * this variable is initialized. - * If the form has been parsed, no element is pushed - * on the stack. - */ - tag: Tag_beg - $ {{ let position = - if config.store_element_positions then - Some(context.manager # position) - else - None - in - }} - ws: Ignore? Ignore* - attlist: attribute()* - emptiness: start_tag_rangle() - /* Note: it is guaranteed that there is whitespace between Tag_beg and - * the name of the first attribute, because there must be some separator. - * So we need not to check ws! - */ - {{ - let rec check_attlist al = - match al with - (nv1, num1) :: al' -> - if not num1 && al' <> [] then begin - match al with - ((n1,_),_) :: ((n2,_),_) :: _ -> - raise(WF_error("Whitespace is missing between attributes `" ^ - n1 ^ "' and `" ^ n2 ^ "'")) - | _ -> assert false - end; - check_attlist al' - | [] -> () - in - check_attlist attlist; - - let name, tag_beg_entid = tag in - let attlist' = List.map (fun (nv,_) -> nv) attlist in - let d = - create_element_node ?position:position spec dtd name attlist' in - - begin match id_index with - None -> () - | Some idx -> - (* Put the ID attribute into the index, if present *) - begin try - let v = d # id_attribute_value in (* may raise Not_found *) - idx # add v d (* may raise ID_not_unique *) - with - Not_found -> - (* No ID attribute *) - () - | ID_not_unique -> - (* There is already an ID with the same value *) - raise(Validation_error("ID not unique")) - end - end; - - if n_tags_open = 0 then begin - if root = None then begin - (* We have found the begin tag of the root element. *) - if config.enable_super_root_node then begin - (* The user wants the super root instead of the real root. - * The real root element becomes the child of the VR. - *) - (* Assertion: self # current is the super root *) - assert (self # current # node_type = T_super_root); - root <- Some (self # current); - self # current # add_node d; - doc # init_root (self # current); - end - else begin - (* Normal behaviour: The user wants to get the real root. *) - root <- Some d; - doc # init_root d; - end; - end - else - (* We have found a second topmost element. This is illegal. *) - raise(WF_error("Document must consist of only one toplevel element")) - end - else begin - (* We have found some inner begin tag. *) - self # save_data; (* Save outstanding data material first *) - self # current # add_node d - end; - - if emptiness then - (* An empty tag like . *) - d # local_validate ~use_dfa:config.validate_by_dfa () - else begin - (* A non-empty tag. *) - Stack.push (d, tag_beg_entid) elstack; - n_tags_open <- n_tags_open + 1; - end; - }} - ? {{ match !yy_position with - "attlist" -> raise(WF_error("Bad attribute list")) - | "emptiness" -> raise(WF_error("`>' or `/>' expected")) - | _ -> raise(WF_error("Bad start tag")) - }} - - -attribute(): - /* Parses name="value" */ - n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore* - {{ (n,v), ws }} - - -attval(): - v:Attval - {{ expand_attvalue lexerset dtd v config.warner true }} -| v:Attval_nl_normalized - {{ expand_attvalue lexerset dtd v config.warner false }} - - -start_tag_rangle(): - Rangle {{ false }} -| Rangle_empty {{ true }} - - -end_tag(): - /* parses . - * Pops the top element from the elstack and checks if it is the same - * element. - */ - tag:Tag_end Ignore* Rangle - {{ let name, tag_end_entid = tag in - if n_tags_open = 0 then - raise(WF_error("End-tag without start-tag")); - - self # save_data; (* Save outstanding data material first *) - - let x, tag_beg_entid = Stack.pop elstack in - let x_name = - match x # node_type with - | T_element n -> n - | _ -> assert false - in - if name <> x_name then - raise(WF_error("End-tag does not match start-tag")); - if tag_beg_entid != tag_end_entid then - raise(WF_error("End-tag not in the same entity as the start-tag")); - x # local_validate ~use_dfa:config.validate_by_dfa (); - - n_tags_open <- n_tags_open - 1; - - assert (n_tags_open >= 0); - - }} - -char_data(): - /* Parses any literal characters not otherwise matching, and adds the - * characters to the top element of elstack. - * If elstack is empty, it is assumed that there is no surrounding - * element, and any non-white space character is forbidden. - */ - data:CharData - {{ - if n_tags_open = 0 then - (* only white space is allowed *) - self # only_whitespace data - else - self # collect_data data - (* We collect the chardata material until the next end tag is - * reached. Then the collected material will concatenated and - * stored as a single T_data node (see end_tag rule above) - * using save_data. - *) - }} -| data:Cdata - {{ - if n_tags_open = 0 then - raise (WF_error("CDATA section not allowed here")); - self # collect_data data - (* Also collect CDATA material *) - }} - -cref(): - /* Parses &#...; and adds the character to the top element of elstack. */ - code:CRef - {{ - if n_tags_open = 0 then - (* No surrounding element: character references are not allowed *) - raise(WF_error("Character reference not allowed here")); - self # collect_data (character config.encoding config.warner code) - (* Also collect character references *) - }} - -pi(): - /* Parses (but not ). - * If there is a top element in elstack, the processing instruction is added - * to this element. - */ - pi: PI - {{ - let position = - if config.store_element_positions then - Some(context.manager # position) - else - None - in - let target,value = pi in - - if n_tags_open = 0 & not config.enable_super_root_node - then - doc # add_pinstr (new proc_instruction target value config.encoding) - else begin - (* Special case: if processing instructions are processed inline, - * they are wrapped into T_pinstr nodes. - *) - if config.enable_pinstr_nodes then begin - self # save_data; (* Save outstanding data material first *) - let pinstr = new proc_instruction target value config.encoding in - let wrapper = create_pinstr_node - ?position:position spec dtd pinstr in - wrapper # local_validate(); (* succeeds always *) - self # current # add_node wrapper; - end - else - (* Normal behaviour: Add the PI to the parent element. *) - self # current # add_pinstr - (new proc_instruction target value config.encoding) - end - }} - - -comment(): - /* Parses - */ - Comment_begin - $ {{ - let position = - if config.enable_comment_nodes && config.store_element_positions then - Some(context.manager # position) - else - None - in - }} - mat: Comment_material* - ce: Comment_end - {{ - if config.enable_comment_nodes then begin - self # save_data; (* Save outstanding data material first *) - let comment_text = String.concat "" mat in - let wrapper = create_comment_node - ?position:position spec dtd comment_text in - wrapper # local_validate(); (* succeeds always *) - self # current # add_node wrapper; - end - }} - ? {{ match !yy_position with - | "ce" -> raise(WF_error("`-->' expected")) - | _ -> raise(WF_error("Bad comment")) - }} - - -%% - (* The method "parse" continues here... *) - - try - match start_symbol with - Ext_document -> - parse_ext_document context.current context.get_next - | Ext_declarations -> - parse_ext_declarations context.current context.get_next - | Ext_element -> - parse_ext_element context.current context.get_next - with - Not_found -> - raise Parsing.Parse_error - - (*********** The method "parse" ends here *************) - - -(**********************************************************************) - -(* Here ends the class definition: *) -end -;; - -(**********************************************************************) - -open Pxp_reader;; - - -class default_ext = - object(self) - val mutable node = (None : ('a extension node as 'a) option) - method clone = {< >} - method node = - match node with - None -> - assert false - | Some n -> n - method set_node n = - node <- Some n - end -;; - - -let default_extension = new default_ext;; - -let default_spec = - make_spec_from_mapping - ~super_root_exemplar: (new element_impl default_extension) - ~comment_exemplar: (new element_impl default_extension) - ~default_pinstr_exemplar: (new element_impl default_extension) - ~data_exemplar: (new data_impl default_extension) - ~default_element_exemplar: (new element_impl default_extension) - ~element_mapping: (Hashtbl.create 1) - () -;; - - -let idref_pass id_index root = - let error t att value = - let name = - match t # node_type with - T_element name -> name - | _ -> assert false - in - let text = - "Attribute `" ^ att ^ "' of element `" ^ name ^ - "' refers to unknown ID `" ^ value ^ "'" in - let pos_ent, pos_line, pos_col = t # position in - if pos_line = 0 then - raise(Validation_error text) - else - raise(At("In entity " ^ pos_ent ^ " at line " ^ - string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^ - ":\n", - Validation_error text)) - in - - let rec check_tree t = - let idref_atts = t # idref_attribute_names in - List.iter - (fun att -> - match t # attribute att with - Value s -> - begin try ignore(id_index # find s) with - Not_found -> - error t att s - end - | Valuelist l -> - List.iter - (fun s -> - try ignore(id_index # find s) with - Not_found -> - error t att s - ) - l - | Implied_value -> () - ) - idref_atts; - List.iter check_tree (t # sub_nodes) - in - check_tree root -;; - - -exception Return_DTD of dtd;; - (* Used by extract_dtd_from_document_entity to jump out of the parser *) - - -let call_parser ~configuration:cfg - ~source:src - ~dtd - ~extensible_dtd - ~document:doc - ~specification:spec - ~process_xmldecl - ~transform_dtd - ~(id_index : 'ext #index option) - ~use_document_entity - ~entry - ~init_lexer = - let e = cfg.errors_with_line_numbers in - let w = cfg.warner in - let r, en = - match src with - Entity(m,r') -> r', m dtd - | ExtID(xid,r') -> r', - if use_document_entity then - new document_entity - r' dtd "[toplevel]" w xid e - cfg.encoding - else - new external_entity - r' dtd "[toplevel]" w xid false e - cfg.encoding - in - r # init_rep_encoding cfg.encoding; - r # init_warner w; - en # set_debugging_mode (cfg.debugging_mode); - let pobj = - new parser_object - doc - dtd - extensible_dtd - cfg - r - spec - process_xmldecl - transform_dtd - (id_index :> 'ext index option) - in - let mgr = new entity_manager en in - en # open_entity true init_lexer; - begin try - let context = make_context mgr in - pobj # parse context entry; - ignore(en # close_entity); - with - Return_DTD d -> - ignore(en # close_entity); - raise(Return_DTD d) - | error -> - ignore(en # close_entity); - r # close_all; - let pos = mgr # position_string in - raise (At(pos, error)) - end; - if cfg.idref_pass then begin - match id_index with - None -> () - | Some idx -> - ( match pobj # root with - None -> () - | Some root -> - idref_pass idx root; - ) - end; - pobj - - -let parse_dtd_entity cfg src = - (* Parse a DTD given as separate entity. *) - let dtd = new dtd cfg.warner cfg.encoding in - let doc = new document cfg.warner in - let pobj = - call_parser - ~configuration:cfg - ~source:src - ~dtd:dtd - ~extensible_dtd:true (* Extend the DTD by parsed declarations *) - ~document:doc - ~specification:default_spec - ~process_xmldecl:false (* The XML declaration is ignored - * (except 'encoding') - *) - ~transform_dtd:(fun x -> x) (* Do not transform the DTD *) - ~id_index: None - ~use_document_entity:false - ~entry:Ext_declarations (* Entry point of the grammar *) - ~init_lexer:Declaration (* The initially used lexer *) - in - dtd # validate; - if cfg.accept_only_deterministic_models then dtd # only_deterministic_models; - dtd -;; - - -let parse_content_entity ?id_index cfg src dtd spec = - (* Parse an element given as separate entity *) - dtd # validate; (* ensure that the DTD is valid *) - if cfg.accept_only_deterministic_models then dtd # only_deterministic_models; - let doc = new document cfg.warner in - let pobj = - call_parser - ~configuration:cfg - ~source:src - ~dtd:dtd - ~extensible_dtd:true (* Extend the DTD by parsed declarations *) - ~document:doc - ~specification:spec - ~process_xmldecl:false (* The XML declaration is ignored - * (except 'encoding') - *) - ~transform_dtd:(fun x -> x) (* Do not transform the DTD *) - ~id_index:(id_index :> 'ext index option) - ~use_document_entity:false - ~entry:Ext_element (* Entry point of the grammar *) - ~init_lexer:Content (* The initially used lexer *) - in - match pobj # root with - Some r -> r - | None -> raise(WF_error("No root element")) -;; - - -let parse_wfcontent_entity cfg src spec = - let dtd = new dtd cfg.warner cfg.encoding in - dtd # allow_arbitrary; - let doc = new document cfg.warner in - let pobj = - call_parser - ~configuration:cfg - ~source:src - ~dtd:dtd - ~extensible_dtd:false (* Do not extend the DTD *) - ~document:doc - ~specification:spec - ~process_xmldecl:false (* The XML declaration is ignored - * (except 'encoding') - *) - ~transform_dtd:(fun x -> x) (* Do not transform the DTD *) - ~id_index:None - ~use_document_entity:false - ~entry:Ext_element (* Entry point of the grammar *) - ~init_lexer:Content (* The initially used lexer *) - in - match pobj # root with - Some r -> r - | None -> raise(WF_error("No root element")) -;; - - -let iparse_document_entity ?(transform_dtd = (fun x -> x)) - ?id_index - cfg0 src spec p_wf = - (* Parse an element given as separate entity *) - (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *) - let cfg = { cfg0 with - recognize_standalone_declaration = - cfg0.recognize_standalone_declaration && (not p_wf) - } in - let dtd = new dtd cfg.warner cfg.encoding in - if p_wf then - dtd # allow_arbitrary; - let doc = new document cfg.warner in - let pobj = - call_parser - ~configuration:cfg - ~source:src - ~dtd:dtd - ~extensible_dtd:(not p_wf) (* Extend the DTD by parsed declarations - * only if in validating mode - *) - ~document:doc - ~specification:spec - ~process_xmldecl:true (* The XML declaration is processed *) - (* TODO: change to 'not p_wf' ? *) - ~transform_dtd:(fun dtd -> - let dtd' = transform_dtd dtd in - if cfg.accept_only_deterministic_models then - dtd' # only_deterministic_models; - dtd') - - ~id_index:(id_index :> 'ext index option) - ~use_document_entity:true - ~entry:Ext_document (* Entry point of the grammar *) - ~init_lexer:Document (* The initially used lexer *) - in - pobj # doc -;; - - -let parse_document_entity ?(transform_dtd = (fun x -> x)) - ?id_index - cfg src spec = - iparse_document_entity - ~transform_dtd:transform_dtd - ?id_index:(id_index : 'ext #index option :> 'ext index option) - cfg src spec false;; - -let parse_wfdocument_entity cfg src spec = - iparse_document_entity cfg src spec true;; - -let extract_dtd_from_document_entity cfg src = - let transform_dtd dtd = raise (Return_DTD dtd) in - try - let doc = parse_document_entity - ~transform_dtd:transform_dtd - cfg - src - default_spec in - (* Should not happen: *) - doc # dtd - with - Return_DTD dtd -> - (* The normal case: *) - dtd -;; - - -let default_config = - let w = new drop_warnings in - { warner = w; - errors_with_line_numbers = true; - enable_pinstr_nodes = false; - enable_super_root_node = false; - enable_comment_nodes = false; - encoding = `Enc_iso88591; - recognize_standalone_declaration = true; - store_element_positions = true; - idref_pass = false; - validate_by_dfa = true; - accept_only_deterministic_models = true; - debugging_mode = false; - } - - -class [ 'ext ] hash_index = -object - constraint 'ext = 'ext node #extension - val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t) - method add s n = - try - ignore(Hashtbl.find ht s); - raise ID_not_unique - with - Not_found -> - Hashtbl.add ht s n - - method find s = Hashtbl.find ht s - method index = ht -end - - -(* ====================================================================== - * History: - * - * $Log$ - * Revision 1.1 2000/11/17 09:57:29 lpadovan - * Initial revision - * - * Revision 1.14 2000/08/26 23:23:14 gerd - * Bug: from_file must not interpret the file name as URL path. - * Bug: When PI and comment nodes are generated, the collected data - * material must be saved first. - * - * Revision 1.13 2000/08/19 21:30:03 gerd - * Improved the error messages of the parser - * - * Revision 1.12 2000/08/18 20:16:25 gerd - * Implemented that Super root nodes, pinstr nodes and comment - * nodes are included into the document tree. - * - * Revision 1.11 2000/08/14 22:24:55 gerd - * Moved the module Pxp_encoding to the netstring package under - * the new name Netconversion. - * - * Revision 1.10 2000/07/23 02:16:33 gerd - * Support for DFAs. - * - * Revision 1.9 2000/07/14 13:57:29 gerd - * Added the id_index feature. - * - * Revision 1.8 2000/07/09 17:52:45 gerd - * New implementation for current_data. - * The position of elements is stored on demand. - * - * Revision 1.7 2000/07/09 01:00:35 gerd - * Improvement: It is now guaranteed that only one data node - * is added for consecutive character material. - * - * Revision 1.6 2000/07/08 16:27:29 gerd - * Cleaned up the functions calling the parser. - * New parser argument: transform_dtd. - * Implementations for 'extract_dtd_from_document_entity' and - * 'parse_wfcontent_entity'. - * - * Revision 1.5 2000/07/06 23:05:18 gerd - * Initializations of resolvers were missing. - * - * Revision 1.4 2000/07/06 22:11:01 gerd - * Fix: The creation of the non-virtual root element is protected - * in the same way as the virtual root element. - * - * Revision 1.3 2000/07/04 22:15:18 gerd - * Change: Using the new resolver capabilities. - * Still incomplete: the new extraction and parsing functions. - * - * Revision 1.2 2000/06/14 22:19:06 gerd - * Added checks such that it is impossible to mix encodings. - * - * Revision 1.1 2000/05/29 23:48:38 gerd - * Changed module names: - * Markup_aux into Pxp_aux - * Markup_codewriter into Pxp_codewriter - * Markup_document into Pxp_document - * Markup_dtd into Pxp_dtd - * Markup_entity into Pxp_entity - * Markup_lexer_types into Pxp_lexer_types - * Markup_reader into Pxp_reader - * Markup_types into Pxp_types - * Markup_yacc into Pxp_yacc - * See directory "compatibility" for (almost) compatible wrappers emulating - * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. - * - * ====================================================================== - * Old logs from markup_yacc.m2y: - * - * Revision 1.9 2000/05/29 21:14:57 gerd - * Changed the type 'encoding' into a polymorphic variant. - * - * Revision 1.8 2000/05/27 19:26:19 gerd - * Change: The XML declaration is interpreted right after - * it has been parsed (no longer after the document): new function - * check_and_parse_xmldecl. - * When elements, attributes, and entities are declared - * it is stored whether the declaration happens in an external - * entity (for the standalone check). - * The option recognize_standalone_declaration is interpreted. - * - * Revision 1.7 2000/05/20 20:31:40 gerd - * Big change: Added support for various encodings of the - * internal representation. - * - * Revision 1.6 2000/05/14 21:51:24 gerd - * Change: Whitespace is handled by the grammar, and no longer - * by the entity. - * - * Revision 1.5 2000/05/14 17:50:54 gerd - * Updates because of changes in the token type. - * - * Revision 1.4 2000/05/11 22:09:17 gerd - * Fixed the remaining problems with conditional sections. - * This seems to be also a weakness of the XML spec! - * - * Revision 1.3 2000/05/09 00:02:44 gerd - * Conditional sections are now recognized by the parser. - * There seem some open questions; see the TODO comments! - * - * Revision 1.2 2000/05/08 22:01:44 gerd - * Introduced entity managers (see markup_entity.ml). - * The XML declaration is now recognized by the parser. If such - * a declaration is found, the method process_xmldecl of the currently - * active entity is called. If the first token is not an XML declaration, - * the method process_missing_xmldecl is called instead. - * Some minor changes. - * - * Revision 1.1 2000/05/06 23:21:49 gerd - * Initial revision. - * - * - * ====================================================================== - * - * COPIED FROM REVISION 1.19 OF markup_yacc.mly - * - * Revision 1.19 2000/05/01 15:20:08 gerd - * "End tag matches start tag" is checked before "End tag in the - * same entity as start tag". - * - * Revision 1.18 2000/04/30 18:23:08 gerd - * Bigger change: Introduced the concept of virtual roots. First, - * this reduces the number of checks. Second, it makes it possible to - * return the virtual root to the caller instead of the real root (new - * config options 'virtual_root' and 'processing_instructions_inline'). - * Minor changes because of better CR/CRLF handling. - * - * Revision 1.17 2000/03/13 23:47:46 gerd - * Updated because of interface changes. (See markup_yacc_shadow.mli - * rev. 1.8) - * - * Revision 1.16 2000/01/20 20:54:43 gerd - * New config.errors_with_line_numbers. - * - * Revision 1.15 1999/12/17 22:27:58 gerd - * Bugfix: The value of 'p_internal_subset' (an instance - * variable of the parser object) is to true when the internal subset - * begins, and is set to false when this subset ends. The error was - * that references to external entities within this subset did not - * set 'p_internal_subset' to false; this is now corrected by introducing - * the 'p_internal_subset_stack'. - * This is a typical example of how the code gets more and - * more complicated and that it is very difficult to really understand - * what is going on. - * - * Revision 1.14 1999/11/09 22:23:37 gerd - * Removed the invocation of "init_dtd" of the root document. - * This method is no longer available. The DTD is also passed to the - * document object by the root element, so nothing essential changes. - * - * Revision 1.13 1999/10/25 23:37:09 gerd - * Bugfix: The warning "More than one ATTLIST declaration for element - * type ..." is only generated if an ATTLIST is found while there are already - * attributes for the element. - * - * Revision 1.12 1999/09/01 23:08:38 gerd - * New frontend function: parse_wf_document. This simply uses - * a DTD that allows anything, and by the new parameter "extend_dtd" it is - * avoided that element, attlist, and notation declarations are added to this - * DTD. The idea is that this function simulates a well-formedness parser. - * Tag_beg, Tag_end carry the entity_id. The "elstack" stores the - * entity_id of the stacked tag. This was necessary because otherwise there - * are some examples to produces incorrectly nested elements. - * p_internal_subset is a variable that stores whether the internal - * subset is being parsed. This is important beacause entity declarations in - * internal subsets are not allowed to contain parameter references. - * It is checked if the "elstack" is empty after all has been parsed. - * Processing instructions outside DTDs and outside elements are now - * added to the document. - * The rules of mixed and regexp style content models have been - * separated. The code is now much simpler. - * Entity references outside elements are detected and rejected. - * - * Revision 1.11 1999/09/01 16:26:08 gerd - * Improved the quality of error messages. - * - * Revision 1.10 1999/08/31 19:13:31 gerd - * Added checks on proper PE nesting. The idea is that tokens such - * as Decl_element and Decl_rangle carry an entity ID with them. This ID - * is simply an object of type < >, i.e. you can only test on identity. - * The lexer always produces tokens with a dummy ID because it does not - * know which entity is the current one. The entity layer replaces the dummy - * ID with the actual ID. The parser checks that the IDs of pairs such as - * Decl_element and Decl_rangle are the same; otherwise a Validation_error - * is produced. - * - * Revision 1.9 1999/08/15 20:42:01 gerd - * Corrected a misleading message. - * - * Revision 1.8 1999/08/15 20:37:34 gerd - * Improved error messages. - * Bugfix: While parsing document entities, the subclass document_entity is - * now used instead of external_entity. The rules in document entities are a bit - * stronger. - * - * Revision 1.7 1999/08/15 14:03:59 gerd - * Empty documents are not allowed. - * "CDATA section not allowed here" is a WF_error, not a Validation_ - * error. - * - * Revision 1.6 1999/08/15 02:24:19 gerd - * Removed some grammar rules that were used for testing. - * Documents without DTD can now have arbitrary elements (formerly - * they were not allowed to have any element). - * - * Revision 1.5 1999/08/14 22:57:20 gerd - * It is allowed that external entities are empty because the - * empty string is well-parsed for both declarations and contents. Empty - * entities can be referenced anywhere because the references are replaced - * by nothing. Because of this, the Begin_entity...End_entity brace is only - * inserted if the entity is non-empty. (Otherwise references to empty - * entities would not be allowed anywhere.) - * As a consequence, the grammar has been changed such that a - * single Eof is equivalent to Begin_entity,End_entity without content. - * - * Revision 1.4 1999/08/14 22:20:01 gerd - * The "config" slot has now a component "warner" which is - * an object with a "warn" method. This is used to warn about characters - * that cannot be represented in the Latin 1 alphabet. - * Furthermore, there is a new component "debugging_mode". - * Some Parse_error exceptions have been changed into Validation_error. - * The interfaces of functions/classes imported from other modules - * have changed; the invocations have been adapted. - * Contents may contain CDATA sections that have been forgotten. - * - * Revision 1.3 1999/08/11 15:00:41 gerd - * The Begin_entity ... End_entity brace is also possible in - * 'contents'. - * The configuration passed to the parsing object contains always - * the resolver that is actually used. - * - * Revision 1.2 1999/08/10 21:35:12 gerd - * The XML/encoding declaration at the beginning of entities is - * evaluated. In particular, entities have now a method "xml_declaration" - * which returns the name/value pairs of such a declaration. The "encoding" - * setting is interpreted by the entity itself; "version", and "standalone" - * are interpreted by Markup_yacc.parse_document_entity. Other settings - * are ignored (this does not conform to the standard; the standard prescribes - * that "version" MUST be given in the declaration of document; "standalone" - * and "encoding" CAN be declared; no other settings are allowed). - * TODO: The user should be warned if the standard is not exactly - * fulfilled. -- The "standalone" property is not checked yet. - * - * Revision 1.1 1999/08/10 00:35:52 gerd - * Initial revision. - * - * - *)