helm/DEVEL/pxp/pxp/pxp_yacc.m2y

   1 (* $Id$ -*- tuareg -*-
   2  * ----------------------------------------------------------------------
   3  * PXP: The polymorphic XML parser for Objective Caml.
   4  * Copyright by Gerd Stolpmann. See LICENSE for details.
   5  *)
   6
   7 open Parsing
   8 open Pxp_types
   9 open Pxp_lexer_types
  10 open Pxp_dtd
  11 open Pxp_entity
  12 open Pxp_document
  13 open Pxp_aux
  14
  15 (* Some types from the interface definition: *)
  16
  17 exception ID_not_unique
  18
  19 class type [ 'ext ] index =
  20 object
  21   constraint 'ext = 'ext node #extension
  22   method add : string -> 'ext node -> unit
  23   method find : string -> 'ext node
  24 end
  25
  26
  27 type config =
  28     { warner : collect_warnings;
  29       errors_with_line_numbers : bool;
  30       enable_pinstr_nodes : bool;
  31       enable_super_root_node : bool;
  32       enable_comment_nodes : bool;
  33       encoding : rep_encoding;
  34       recognize_standalone_declaration : bool;
  35       store_element_positions : bool;
  36       idref_pass : bool;
  37       validate_by_dfa : bool;
  38       accept_only_deterministic_models : bool;
  39       debugging_mode : bool;
  40     }
  41
  42 type source =
  43     Entity of ((dtd -> Pxp_entity.entity) * Pxp_reader.resolver)
  44   | ExtID of (ext_id * Pxp_reader.resolver)
  45
  46
  47 type start_symbol =
  48     Ext_document
  49   | Ext_declarations
  50   | Ext_element
  51
  52
  53 type context =
  54     { mutable current : unit -> token;  (* get the current token *)
  55       mutable get_next : unit -> token; (* go on to the next token; return it *)
  56       mutable current_token : token;    (* This is the current token *)
  57       mutable manager : entity_manager; (* The entity manager *)
  58     }
  59
  60
  61 let make_context entity_manager =
  62   let c =
  63     { current = (fun _ -> assert false);
  64       get_next = (fun _ -> assert false);
  65       current_token = Eof;
  66       manager = entity_manager;
  67     }
  68   in
  69   (* Note that the function which is stored in get_next_ref can be changed
  70    * as a side-effect when an entity is opened or closed. The function in
  71    * c.get_next must be programmed such that always the current "get_next"
  72    * function is executed.
  73    *)
  74   let get_next_ref = entity_manager # yy_get_next_ref in
  75   c.current  <- (fun () -> c.current_token);
  76   c.get_next <- (fun () -> let tok = !get_next_ref() in
  77                            c.current_token <- tok;
  78                            tok);
  79   ignore(c.get_next());
  80   c
  81 ;;
  82
  83
  84 let from_channel ?system_encoding ?id:init_id ?fixenc ch =
  85
  86   (* Reading from a channel works by modifying the algorithm of
  87    * resolve_as_file.
  88    *)
  89
  90   let url_syntax =      (* A syntax suitable for "file" URLs *)
  91     { Neturl.null_url_syntax with
  92         Neturl.url_enable_scheme = Neturl.Url_part_allowed;
  93         Neturl.url_enable_host   = Neturl.Url_part_allowed;
  94         Neturl.url_enable_path   = Neturl.Url_part_required;
  95         Neturl.url_accepts_8bits = true;
  96     }
  97   in
  98
  99   let an_url =
 100     Neturl.make_url
 101       ~scheme: "file"
 102       ~host:   ""
 103       ~path:   [ "" ]
 104       url_syntax
 105   in
 106
 107   let init_channel_done = ref false in
 108     (* Whether the first access to this source has already happened. *)
 109
 110   (* The task of url_of_id is:
 111    * - When it is called the first time, and no init_id is present,
 112    *   the URL file:/// is passed back (an_url). This forces that
 113    *   absolute path names /path/dir/... will be interpreted as
 114    *   file path names. (But relative path names will not work.)
 115    * - If an init_id has been passed, we can assume that the opened URL
 116    *   is exactly this init_id. By raising Not_competent it is indicated
 117    *   that the standard method is to be used for the interpretation of
 118    *   the URL.
 119    * - Otherwise, the channel is already being read, and thus cannot again
 120    *   opened. (This case is handled in channel_of_url.)
 121    *)
 122
 123   let url_of_id xid =
 124     if !init_channel_done then begin
 125       (* Use the normal way of determining the URL of the ID: *)
 126       raise Pxp_reader.Not_competent
 127     end
 128     else begin
 129       match init_id with
 130           None ->
 131             an_url
 132               (* If the channel is not associated with any URL: Simply pass
 133                * the URL file:/// back.
 134                *)
 135         | Some the_init_id ->
 136             assert (the_init_id = xid);
 137             raise Pxp_reader.Not_competent
 138               (* If the channel is associated with a URL, the corresponding
 139                * ID must be passed when the first invocation happens.
 140                *)
 141     end
 142   in
 143
 144   (* The task of channel_of_url:
 145    * - If it is called the first time ("else"), the channel is returned
 146    * - Otherwise, the channel is already being read, and thus cannot again
 147    *   opened. By raising Not_competent it is signaled that the
 148    *   resolve_as_file object must not continue to open the URL.
 149    *)
 150
 151   let channel_of_url url =
 152     if !init_channel_done then
 153       raise Pxp_reader.Not_competent
 154     else begin
 155       init_channel_done := true;
 156       ch, fixenc
 157     end
 158   in
 159
 160   let r =
 161     new Pxp_reader.resolve_as_file
 162       ?system_encoding:system_encoding
 163       ~url_of_id:url_of_id
 164       ~channel_of_url:channel_of_url
 165       ()
 166   in
 167
 168   let init_xid =
 169     match init_id with
 170         None   -> Anonymous
 171       | Some id ->
 172           (* Note: 'id' may be illegal (malformed); in this case, the first
 173            * invocation of url_of_id will raise Not_competent, and the 'open_in'
 174            * method will fail.
 175            *)
 176           id
 177   in
 178
 179   ExtID(init_xid, r)
 180 ;;
 181
 182
 183 let from_file ?system_encoding utf8_filename =
 184
 185   let r =
 186     new Pxp_reader.resolve_as_file
 187       ?system_encoding:system_encoding
 188       ()
 189   in
 190
 191   let utf8_abs_filename =
 192     if utf8_filename <> "" && utf8_filename.[0] = '/' then
 193       utf8_filename
 194     else
 195       Sys.getcwd() ^ "/" ^ utf8_filename
 196   in
 197
 198   let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in
 199   let url = Neturl.make_url
 200               ~scheme:"file"
 201               ~host:"localhost"
 202               ~path:(Neturl.split_path utf8_abs_filename)
 203               syntax
 204   in
 205
 206   let xid = System (Neturl.string_of_url url) in
 207
 208
 209   ExtID(xid, r)
 210 ;;
 211
 212
 213 let from_string ?fixenc s =
 214   let r =
 215     new Pxp_reader.resolve_read_this_string ?fixenc:fixenc s in
 216   ExtID(Anonymous, r)
 217 ;;
 218
 219
 220 (**********************************************************************)
 221
 222 class ['ext] parser_object
 223   init_doc init_dtd init_extend_dtd init_config init_resolver init_spec
 224   init_process_xmldecl transform_dtd id_index
 225   =
 226   object (self)
 227
 228       (* Note that the 'ext parameter has been the motivation to make the
 229        * parser a class.
 230        *)
 231
 232     val mutable dtd = init_dtd
 233         (* The DTD being parsed; or the DTD currently assumed *)
 234
 235     val extend_dtd = init_extend_dtd
 236         (* Whether the DTD should be extended by ELEMENT, ATTLIST, and
 237          * NOTATION declarations or not. (True for validating mode,
 238          * false for well-formedness mode.)
 239          *)
 240
 241     val transform_dtd = transform_dtd
 242         (* A function transforming the DTD *)
 243
 244     val id_index = (id_index : 'ext index option)
 245         (* The ID index or None *)
 246
 247     val process_xmldecl = init_process_xmldecl
 248         (* Whether the XML declaration is parsed and the found XML version
 249          * and standalone declaration are passed to 'doc'.
 250          *)
 251
 252     val lexerset = Pxp_lexers.get_lexer_set (init_config.encoding)
 253
 254     val doc = init_doc
 255         (* The current document *)
 256
 257     method doc = (doc : 'ext document)
 258
 259     val resolver = init_resolver
 260         (* The resolver for external IDs *)
 261
 262     val config = init_config
 263         (* The current configuration *)
 264
 265     val elstack = (Stack.create() : ('ext node * entity_id) Stack.t)
 266        (* The element stack containing all open elements, i.e. elements that
 267         * have begun by a start tag but that have not been finished (end tag).
 268         * If the parser sees a start tag, it creates the element and pushes it
 269         * on top of this stack. If the parser recognizes an end tag, it pulls
 270         * one element from the stack and checks if it has the same name as
 271         * given with the end tag.
 272         *
 273         * At initialization time, a special element is pushed on the stack,
 274         * the so-called super root. It is always the bottommost
 275         * element of the stack, and serves as a guard.
 276         * [See "initializer" below.]
 277         *)
 278
 279     method current =
 280         (* Get the top element of the element stack *)
 281         try
 282           fst(Stack.top elstack)
 283         with
 284             Stack.Empty -> assert false
 285                 (* Not possible, because the super root is always the element
 286                  * at the bottom of the stack.
 287                  *)
 288
 289     val mutable n_tags_open = 0
 290         (* Number of begin tags that have been parsed and whose corresponding
 291          * end tags have not yet been parsed
 292          *)
 293
 294     val mutable p_internal_subset = false
 295         (* true while parsing the internal subset - there are some additional
 296          * constraints for internal subsets, and because of this it must
 297          * be known whether the current declaration is contained in the
 298          * internal or external subset of the DTD.
 299          *)
 300
 301     val mutable root = None
 302         (* Contains the root element (topmost element) while it is being parsed
 303          * and after it has been parsed.
 304          * This variable is None before the root element is seen.
 305          *)
 306
 307     method root = root
 308
 309     val spec = init_spec
 310         (* A hashtable that contains exemplar objects for the various element
 311          * types. If an element is parsed, the exemplar is looked up and
 312          * "cloned" (by the "create" method)
 313          *)
 314
 315     val mutable current_data = []
 316         (* Collects character data. *)
 317
 318     method collect_data s =
 319         (* Collects the character material 's' *)
 320         current_data <- s :: current_data
 321
 322     method save_data =
 323       (* Puts the material collected in 'current_data' into a new
 324        * node, and appends this node as new sub node to 'current'
 325        *)
 326       match current_data with
 327           [] ->
 328             ()
 329         | [ str ] ->
 330             if str <> "" then
 331               self # current # add_node (create_data_node spec dtd str);
 332             current_data <- []
 333         | _ ->
 334             let count = List.fold_left
 335                           (fun acc s -> acc + String.length s)
 336                           0
 337                           current_data in
 338             let str = String.create count in
 339             let pos = ref count in
 340             List.iter
 341               (fun s ->
 342                  let l = String.length s in
 343                  pos := !pos - l;
 344                  String.blit
 345                  ~src:s
 346                  ~src_pos:0
 347                  ~dst:str
 348                  ~dst_pos:(!pos)
 349                  ~len:l
 350               )
 351               current_data;
 352             assert(!pos = 0);
 353             if str <> "" then
 354               self # current # add_node (create_data_node spec dtd str);
 355             current_data <- []
 356
 357
 358     method only_whitespace data =
 359         (* Checks that the string "data" contains only whitespace. On failure,
 360          * Validation_error is raised.
 361          *)
 362       let lexbuf = Lexing.from_string data in
 363       let t1 = lexerset.scan_name_string lexbuf in
 364       if t1 <> Ignore then
 365         raise(WF_error("Data not allowed here"));
 366       let t2 = lexerset.scan_name_string lexbuf in
 367       if t2 <> Eof then
 368         raise(WF_error("Data not allowed here"));
 369       ()
 370
 371     initializer
 372       (* CHECKS: *)
 373       if config.encoding <> dtd # encoding then
 374         failwith("Encoding mismatch");
 375
 376       (* --- Initialize 'elstack': Push the super-root on the stack. *)
 377       let super_root =
 378         if config.enable_super_root_node then
 379           create_super_root_node spec dtd
 380         else
 381           (* because spec may not contain an exemplar for the super root: *)
 382           create_no_node spec dtd
 383       in
 384       (* Move the super root or the emulation to the stack: *)
 385       Stack.push (super_root, (self :> entity_id)) elstack;
 386
 387
 388
 389       (********* Here the method "parse" begins. The grammar below is
 390        *         transformed to a local function of this method
 391        *)
 392
 393       method parse context start_symbol =
 394
 395         let parse_ignored_section yy_current yy_get_next =
 396           (* A special parser which should be used after <![IGNORE[.
 397            * It parses until the corresponding ]]> is found.
 398            *)
 399
 400           while yy_current() = Ignore do
 401             ignore(yy_get_next());
 402           done;
 403
 404           ( match yy_current() with
 405               Conditional_body _ -> ()
 406             | _                  -> raise Parsing.Parse_error;
 407           );
 408
 409           let en = context.manager # current_entity in
 410           let llev = ref 1 in
 411           while !llev >= 1 do
 412             let igntok = en # next_ignored_token in
 413             (* next_ignored_token: uses a special lexer that only
 414              * recognizes Conditional_begin and Conditional_end;
 415              * other character combinations are ignored.
 416              *)
 417             (* NOTE: next_ignored_token works much like yy_get_next,
 418              * but it does not set the current token!
 419              *)
 420             match igntok with
 421                 Conditional_begin _ ->
 422                   llev := !llev + 1
 423               | Conditional_end _ ->
 424                   llev := !llev - 1;
 425                   (* Because the loop may be exited now: *)
 426                   context.current_token <- igntok;
 427               | (End_entity | Eof) ->
 428                   raise Parsing.Parse_error
 429               | _ ->
 430                   ()
 431           done;
 432
 433         in
 434
 435
 436         let check_and_parse_xmldecl xmldecl =
 437           if process_xmldecl then begin
 438             let v, _, s = decode_doc_xml_pi (decode_xml_pi xmldecl) in
 439             check_version_num v;
 440             doc # init_xml_version v;
 441             let v = match s with
 442                 None -> false
 443               | Some "yes" -> true
 444               | Some "no" -> false
 445               | _ -> raise (WF_error("Illegal 'standalone' declaration"))
 446             in
 447             if config.recognize_standalone_declaration then
 448               dtd # set_standalone_declaration v
 449           end
 450         in
 451
 452         let recode_utf8 s =
 453           (* Recode 's' to UTF-8 *)
 454           if config.encoding = `Enc_utf8 then
 455             s   (* No recoding necessary *)
 456           else
 457             Netconversion.recode_string
 458               ~in_enc:(config.encoding :> encoding) ~out_enc:`Enc_utf8 s
 459         in
 460
 461
 462 %%
 463
 464 /* The following grammar looks similar to ocamlyacc grammars, but
 465  * ocamlyacc is actually not used to transform the grammar into a parser.
 466  * Instead, the parser generator m2parsergen is applied.
 467  *
 468  * The format of the grammar is different (see m2parsergen/README),
 469  * but I hope that you can understand most features immediately.
 470  *
 471  * The type of the parser is different: m2parsergen creates a top-down
 472  * parser while ocamlyacc generates a LALR-1 parser.
 473  *
 474  * The way the generated code is called is different: ocamlyacc produces
 475  * lots of top-level definitions whereas m2parsergen generates only
 476  * a local let-in-phrase. This is explained in the already mentioned
 477  * README file.
 478  */
 479
 480 /* See Pxp_types.ml for comments to the various tokens */
 481
 482 %token Begin_entity
 483 %token End_entity
 484 %token Comment_begin
 485 %token Comment_end
 486 %token Ignore
 487 %token Eq
 488 %token Rangle
 489 %token Rangle_empty
 490 %token <> Conditional_begin
 491 %token <> Conditional_body
 492 %token <> Conditional_end
 493 %token Percent
 494 %token Plus
 495 %token Star
 496 %token Bar
 497 %token Comma
 498 %token Qmark
 499 %token Pcdata
 500 %token Required
 501 %token Implied
 502 %token Fixed
 503 %token Eof
 504
 505 %token <> Comment_material
 506 %token <> Doctype
 507 %token <> Doctype_rangle
 508 %token <> Dtd_begin
 509 %token <> Dtd_end
 510 %token <> Decl_element
 511 %token <> Decl_attlist
 512 %token <> Decl_entity
 513 %token <> Decl_notation
 514 %token <> Decl_rangle
 515 %token <> Lparen
 516 %token <> Rparen
 517 %token <> RparenPlus
 518 %token <> RparenStar
 519 %token <> RparenQmark
 520
 521 %token <> Tag_beg
 522 %token <> Tag_end
 523
 524 %token <> PI
 525 %token <> PI_xml
 526 %token <> Cdata
 527 %token <> CRef
 528 %token <> ERef
 529 %token <> PERef
 530 %token <> CharData
 531 %token <> LineEnd
 532 %token <> Name
 533 %token <> Nametoken
 534 %token <> Attval
 535 %token <> Attval_nl_normalized
 536 %token <> Unparsed_string
 537
 538 /* START SYMBOLS:
 539  *
 540  * "ext_document":       parses a complete XML document (i.e. containing a
 541  *                       <!DOCTYPE..> and an element)
 542  * "ext_declarations":   parses an "external DTD subset", i.e. a sequence
 543  *                       of declarations
 544  * "ext_element":        parses a single element (no <!DOCTYPE...> allowed);
 545  *                       the element needs not to be the root element of the
 546  *                       DTD
 547  *
 548  * The functions corresponding to these symbols return always () because
 549  * they only have side-effects.
 550  */
 551
 552 /* SOME GENERAL COMMENTS:
 553  *
 554  * The parser does not get its tokens from the lexers directly. Instead of
 555  * this, there is an entity object between the parser and the lexers. This
 556  * object already handles:
 557  *
 558  * - References to general and parameter entities. The token stream is
 559  *   modified such that tokens automatically come from the referenced entities.
 560  *   External parameter entities and all general entities are embraced by
 561  *   the two special tokens Begin_entity and End_entity. The parser must
 562  *   check that these braces are correctly nested.
 563  */
 564
 565 %%
 566
 567
 568 ext_document():
 569   Begin_entity
 570   doc_xmldecl_then_misc_then_prolog_then_rest() End_entity
 571     {{
 572       if n_tags_open <> 0 then
 573         raise(WF_error("Missing end tag"))
 574     }}
 575
 576
 577 /* In the following rule, we must find out whether there is an XML declaration
 578  * or not, and directly after that either "process_xmldecl" or
 579  * "process_missing_xmldecl" of the current entity must be called.
 580  * AND IT MUST BE DIRECTLY! Because of this, the invocation is carried out
 581  * in the "$" clause immediately following the first token.
 582  *
 583  * TODO: This is not enough. The first token may be a tag, and the tag
 584  * may already contain non-ASCII characters. (But in this case, the resolvers
 585  * assume UTF8, and they are right...)
 586  */
 587
 588 doc_xmldecl_then_misc_then_prolog_then_rest():
 589   pl:PI_xml
 590   $ {{ context.manager # current_entity # process_xmldecl pl;
 591        check_and_parse_xmldecl pl;
 592     }}
 593   misc()* doc_prolog_then_rest()
 594     {{ () }}
 595
 596 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
 597   misc() misc()* doc_prolog_then_rest()
 598     {{ () }}
 599
 600 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
 601   doctypedecl() misc()* contents_start()
 602     {{ () }}
 603
 604 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
 605   contents_start()
 606     {{ () }}
 607
 608
 609 doc_prolog_then_rest():
 610   doctypedecl() misc()* contents_start()
 611     {{ () }}
 612 | contents_start()
 613     {{ () }}
 614
 615
 616 ext_element():
 617   Begin_entity el_xmldecl_then_misc_then_rest() End_entity
 618     {{
 619       if n_tags_open <> 0 then
 620         raise(WF_error("Missing end tag"))
 621     }}
 622
 623
 624 /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
 625
 626 el_xmldecl_then_misc_then_rest():
 627   pl:PI_xml
 628   $ {{ context.manager # current_entity # process_xmldecl pl; }}
 629   misc()* contents_start()
 630     {{ () }}
 631
 632 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
 633   misc() misc()* contents_start()
 634     {{ () }}
 635
 636 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
 637   contents_start()
 638     {{ () }}
 639
 640
 641 ext_declarations():
 642   /* Parses a sequence of declarations given by an entity. As side-effect,
 643    * the parsed declarations are put into the dtd object.
 644    */
 645   Begin_entity decl_xmldecl_then_rest()
 646    {{ () }}
 647 | Eof
 648    {{ () }}
 649
 650
 651 decl_xmldecl_then_rest():
 652   /* Note: This rule is also called from declaration()! */
 653   pl:PI_xml
 654   $ {{ context.manager # current_entity # process_xmldecl pl;
 655     }}
 656   declaration()* End_entity
 657    {{ () }}
 658
 659 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
 660   declaration() declaration()* End_entity
 661    {{ () }}
 662
 663 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
 664   End_entity
 665    {{ () }}
 666
 667
 668 misc():
 669   pi()
 670     {{ () }}
 671 | data: CharData
 672     /* In this context, the lexers sometimes do not recognize white space;
 673      * instead CharData tokens containing white space are delivered.
 674      */
 675     {{ self # only_whitespace data }}
 676 | Ignore
 677     {{ () }}
 678 | comment()
 679     {{ () }}
 680
 681
 682 /********************* DOCUMENT TYPE DECLARATION *************************/
 683
 684 doctypedecl():
 685   /* parses from <!DOCTYPE to >. As side-effect, first the declarations of
 686    * the internal DTD (if any) are put into !!on_dtd, then the declarations
 687    * of the external DTD (if any) are put into this DTD object.
 688    */
 689   doctype_entid:  Doctype
 690              ws:  Ignore Ignore*
 691                   doctypedecl_material (doctype_entid)
 692     {{ () }}
 693   ? {{ match !yy_position with
 694            "ws" -> raise(WF_error("Whitespace is missing after `DOCTYPE'"))
 695          | _    -> raise(WF_error("Bad DOCTYPE declaration"))
 696      }}
 697
 698
 699 /* TRICK:
 700  *   ws: Ignore? Ignore*
 701  * is meant seriously. The effect is that ws becomes a boolean variable
 702  * which is true if there is an Ignore token and false otherwise.
 703  * This construct is faster than just
 704  *   ws: Ignore*
 705  * in which case ws becomes an integer variable containing the number of
 706  * Ignore tokens. Counting the number of tokens is slower than only checking
 707  * the existence.
 708  *
 709  * We need the information whether there is an Ignore token (representing
 710  * white space), because white space is only obligatory if also an identifier
 711  * for the external subset is parsed; this conditional syntax constraint is
 712  * simply programmed in the body of the grammar rule.
 713  */
 714
 715 doctypedecl_material(doctype_entid):
 716   root_name:             Name
 717   ws:                    Ignore? Ignore*
 718   external_subset:       external_id()?
 719                          Ignore*
 720   internal_subset:       internal_dtd()?
 721                          Ignore*
 722   doctype_rangle_entid:  Doctype_rangle
 723     {{
 724       if doctype_entid != doctype_rangle_entid then
 725         raise (Validation_error("Entities not properly nested with DOCTYPE declaration"));
 726       dtd # set_root root_name;
 727       begin match external_subset, internal_subset with
 728           None, None      -> ()         (* no DTD means no ID *)
 729         | None, Some _    -> dtd # set_id Internal
 730         | Some id, None   -> dtd # set_id (External id)
 731         | Some id, Some _ -> dtd # set_id (Derived id)
 732       end;
 733       (* Get now the external doctype declaration. Note that the internal
 734        * subset has precedence and must be read first.
 735        *)
 736       begin match external_subset with
 737           None -> ()
 738         | Some id ->
 739             if not ws then
 740               raise(WF_error("Whitespace is missing after `DOCTYPE " ^
 741                              root_name ^ "'"));
 742             let r' = resolver # clone in
 743             let pobj =
 744               new parser_object
 745                 (new document config.warner)
 746                 dtd
 747                 extend_dtd
 748                 config
 749                 r'
 750                 spec
 751                 process_xmldecl
 752                 (fun x -> x)
 753                 None
 754             in
 755             let en = new external_entity r' dtd "[dtd]"
 756                          config.warner id false config.errors_with_line_numbers
 757                          config.encoding
 758             in
 759             en # set_debugging_mode (config.debugging_mode);
 760             let mgr = new entity_manager en in
 761             en # open_entity true Declaration;
 762             try
 763               let context = make_context mgr in
 764               pobj # parse context Ext_declarations;
 765               ignore(en # close_entity);
 766             with
 767                 error ->
 768                   ignore(en # close_entity);
 769                   r' # close_all;
 770                   let pos = mgr # position_string in
 771                   raise (At(pos, error))
 772       end;
 773       dtd # validate
 774     }}
 775   ? {{
 776        match !yy_position with
 777            "doctype_rangle_entid" -> raise(WF_error("`>' expected"))
 778          | _                      -> raise(WF_error("Bad DOCTYPE declaration"))
 779     }}
 780
 781 /* Note that there are no keywords for SYSTEM or PUBLIC, as these would
 782  * be difficult to recognize in the lexical contexts. Because of this,
 783  * SYSTEM/PUBLIC is parsed as name, and the rule for everything after
 784  * SYSTEM/PUBLIC is computed dynamically.
 785  */
 786
 787 external_id():
 788   tok:Name
 789   $ {{
 790        let followup =
 791          match tok with
 792              "SYSTEM" -> parse_system_id
 793                            (* Apply the rule system_id (below) to parse the
 794                             * rest of the ID
 795                             *)
 796            | "PUBLIC" -> parse_public_id
 797                            (* Apply the rule public_id (below) to parse the
 798                             * rest of the ID
 799                             *)
 800            | _        -> raise(WF_error("SYSTEM or PUBLIC expected"))
 801        in
 802      }}
 803   ws:Ignore Ignore*
 804   r:[followup]()
 805     {{ r }}
 806   ? {{ match !yy_position with
 807            "ws" -> raise(WF_error("Whitespace is missing after " ^ tok))
 808          | _    -> raise(WF_error("Bad SYSTEM or PUBLIC identifier"))
 809     }}
 810
 811
 812 system_id():
 813   str:Unparsed_string
 814     {{ System (recode_utf8 str) }}
 815
 816
 817 public_id():
 818   str1: Unparsed_string
 819     ws: Ignore Ignore*
 820   str2: Unparsed_string
 821     {{ check_public_id str1;
 822        Public(recode_utf8 str1, recode_utf8 str2)
 823     }}
 824   ? {{ match !yy_position with
 825            "ws" -> raise(WF_error("Whitespace is missing between the literals of the PUBLIC identifier"))
 826          | _    -> raise(WF_error("Bad PUBLIC identifier"))
 827     }}
 828
 829
 830 /* The internal subset: "[" declaration* "]". While parsing the declarations
 831  * the object variable p_internal_subset must be true; however, if there
 832  * are entity references, this variable must be reset to false during
 833  * the entity. (See the rule for "declaration" below.)
 834  */
 835
 836 internal_dtd():
 837   dtd_begin_entid:    internal_dtd_begin()
 838                       declaration()*
 839   dtd_end_entid:      internal_dtd_end()
 840     {{
 841       if dtd_begin_entid != dtd_end_entid then
 842         raise(Validation_error("Entities not properly nested with internal DTD subset"))
 843     }}
 844   ? {{ match !yy_position with
 845            "dtd_end_entid" -> raise(WF_error("`]' expected"))
 846          | _               -> raise(WF_error("Bad internal DTD subset"))
 847     }}
 848
 849
 850 internal_dtd_begin():
 851   Dtd_begin
 852     {{ assert (not p_internal_subset);
 853        p_internal_subset <- true }}
 854
 855
 856 internal_dtd_end():
 857   Dtd_end
 858     {{ assert p_internal_subset;
 859        p_internal_subset <- false }}
 860
 861
 862 declaration():
 863   /* Parses a single declaration (or processing instruction). As side-effect
 864    * the parsed declaration is stored into the dtd object.
 865    */
 866   elementdecl()
 867     {{ () }}
 868 | attlistdecl()
 869     {{ () }}
 870 | entid:Decl_entity ws:Ignore Ignore* e:entitydecl(entid)
 871     {{ () }}
 872   ? {{ match !yy_position with
 873            "ws" -> raise(WF_error("Whitespace is missing after ENTITY"))
 874          | "e"  -> raise(WF_error("Name or `%' expected"))
 875          | _    -> raise(WF_error("Bad entity declaration"))
 876     }}
 877 | notationdecl()
 878     {{ () }}
 879 | pi: PI
 880     {{ let target, value = pi in
 881        let pi = new proc_instruction target value config.encoding in
 882        dtd # add_pinstr pi
 883     }}
 884 | Ignore
 885     {{ () }}
 886 | Comment_begin Comment_material* ce:Comment_end
 887     {{ () }}
 888   ? {{ match !yy_position with
 889            "ce" -> raise(WF_error("`-->' expected"))
 890          | _    -> raise(WF_error("Bad comment"))
 891     }}
 892 | Begin_entity
 893   $ {{ (* Set 'p_internal_subset' to 'false' until the matching 'end_entity'
 894         * rule is parsed. This allows unrestricted usage of parameter entities
 895         * within declarations of internal entities.
 896         *)
 897        let old_p_internal_subset = p_internal_subset in
 898        p_internal_subset <- false;
 899     }}
 900   decl_xmldecl_then_rest()
 901     {{ (* Restore the old value of 'p_internal_subset'. *)
 902        p_internal_subset <- old_p_internal_subset;
 903        ()
 904     }}
 905 | begin_entid:Conditional_begin
 906   $ {{ (* Check whether conditional sections are allowed at this position. *)
 907        if p_internal_subset then
 908          raise(WF_error("Restriction of the internal subset: Conditional sections not allowed"));
 909      }}
 910   Ignore*
 911   cond:conditional_section()  end_entid:Conditional_end
 912     {{ (* Check whether Conditional_begin and Conditional_end are in the same
 913         * entity. (This restriction is explained in the file SPECS.)
 914         *)
 915        if begin_entid != end_entid then
 916          raise(Validation_error("The first and the last token of conditional sections must be in the same entity (additional restriction of this parser)"));
 917     }}
 918   ? {{ match !yy_position with
 919            "end_entid" -> raise(WF_error("`>]>' expected"))
 920          | "cond"      -> raise(WF_error("INCLUDE or IGNORE expected"))
 921          | _           -> raise(WF_error("Bad conditional section"))
 922     }}
 923
 924 /* The tokens INCLUDE/IGNORE are scanned as names, and the selection of the
 925  * right parsing rule is dynamic.
 926  * Note that parse_ignored_section is not defined by a grammar rule but
 927  * by a conventional let-binding above.
 928  */
 929
 930 conditional_section():
 931   include_or_ignore:Name
 932   $ {{ let parsing_function =
 933          match include_or_ignore with
 934              "INCLUDE"  -> parse_included_section
 935                            (* invoke rule "included_section" below *)
 936            | "IGNORE"   -> parse_ignored_section
 937                            (* invoke function "parse_ignored_section" *)
 938            | _          -> raise(WF_error("INCLUDE or IGNORE expected"))
 939        in
 940     }}
 941   [ parsing_function ] ()
 942     {{ () }}
 943   ? {{ raise(WF_error("Bad conditional section")) }}
 944
 945 included_section():
 946   Conditional_body declaration()*
 947     {{ () }}
 948 | Ignore Ignore* Conditional_body declaration()*
 949     {{ () }}
 950
 951
 952 /*************************** ELEMENT DECLARATIONS ********************/
 953
 954 elementdecl():
 955   /* parses <!ELEMENT ... >. Puts the parsed element type as side-effect into
 956    * dtd.
 957    */
 958   decl_element_entid:      Decl_element
 959   $ {{ let extdecl = context.manager # current_entity_counts_as_external in
 960      }}
 961   ws1:                     Ignore Ignore*
 962   name:                    Name
 963   ws2:                     Ignore Ignore*
 964   content_model:           contentspec()
 965                            Ignore*
 966   decl_rangle_entid:       Decl_rangle
 967     {{
 968       if decl_element_entid != decl_rangle_entid then
 969         raise (Validation_error "Entities not properly nested with ELEMENT declaration");
 970       if extend_dtd then begin
 971         let el = new dtd_element dtd name in
 972         (* It is allowed that an <!ATTLIST...>  precedes the corresponding
 973          * <!ELEMENT...>. Because of this it is possible that there is already
 974          * an element called 'name' in the DTD, and we only must set the content
 975          * model of this element.
 976          *)
 977         try
 978           dtd # add_element el;
 979           el # set_cm_and_extdecl content_model extdecl;
 980         with
 981             Not_found ->  (* means: there is already an element 'name' *)
 982               let el' = dtd # element name in
 983               el' # set_cm_and_extdecl content_model extdecl;
 984               (* raises Validation_error if el' already has a content model *)
 985       end
 986     }}
 987   ? {{ match !yy_position with
 988            ("ws1"|"ws2")   -> raise(WF_error("Whitespace is missing"))
 989          | "name"          -> raise(WF_error("The name of the element is expected here"))
 990          | "content_model" -> raise(WF_error("Content model expression expected"))
 991          | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
 992          | _                   -> raise(WF_error("Bad element type declaration"))
 993     }}
 994
 995 contentspec():
 996   /* parses a content model and returns it (type content_model_type) */
 997   name: Name   /* EMPTY or ANY */
 998     {{ match name with
 999           "EMPTY" -> Empty
1000         | "ANY"   -> Any
1001         | _       -> raise(WF_error("EMPTY, ANY, or a subexpression expected"))
1002     }}
1003 | entid:Lparen  Ignore*  term:mixed_or_regexp(entid)
1004     {{ term }}
1005   ? {{ raise(WF_error("Bad content model expression")) }}
1006
1007
1008 /* Many of the following rules have an lparen_entid argument. This is the
1009  * internal ID of the entity containing the corresponding left parenthesis;
1010  * by comparing it with the ID of the entity of the right parenthesis the
1011  * contraint is implemented that both parentheses must be in the same entity.
1012  */
1013
1014 mixed_or_regexp(lparen_entid):
1015   re: choice_or_seq(lparen_entid)
1016     {{ Regexp re }}
1017 | m: mixed(lparen_entid)
1018     {{ m }}
1019
1020
1021 multiplier():
1022   /* returns one of the multiplier symbols (?,*,+) */
1023   Plus
1024     {{ Plus }}
1025 | Star
1026     {{ Star }}
1027 | Qmark
1028     {{ Qmark }}
1029
1030
1031 mixed (lparen_entid) :
1032                    Pcdata
1033                    Ignore*
1034   material:        mixed_alternatives_top()
1035     {{
1036       let rest, rparen_entid = material in
1037       if lparen_entid != rparen_entid then
1038         raise (Validation_error "Entities not properly nested with parentheses");
1039       Mixed (MPCDATA :: rest)
1040     }}
1041   ? {{ raise(WF_error("Bad content model expression")) }}
1042
1043
1044 mixed_alternatives_top():
1045   entid: Rparen
1046     {{ [], entid }}
1047 | entid: RparenStar
1048     {{ [], entid }}
1049 | Bar Ignore* name:Name Ignore* names:mixed_alternative()* entid:RparenStar
1050     {{
1051        (MChild name :: names), entid
1052     }}
1053   ? {{ match !yy_position with
1054            "name"  -> raise(WF_error("Name expected"))
1055          | "entid" -> raise(WF_error("`)*' expected"))
1056          | _       -> raise(WF_error("Bad content model expression"))
1057     }}
1058
1059
1060 mixed_alternative() :
1061   Bar Ignore* name:Name Ignore*
1062     {{ MChild name }}
1063   ? {{ match !yy_position with
1064            "name" -> raise(WF_error("Name expected"))
1065          | _      -> raise(WF_error("Bad content model expression"))
1066     }}
1067
1068
1069
1070 choice_or_seq (lparen_entid):
1071   /* parses either a regular expression, or a mixed expression. Returns
1072    * Mixed spec or Regexp spec (content_model_type).
1073    * Which kind of expression (regexp or mixed) is being read is recognized
1074    * after the first subexpression has been parsed; the other subexpressions
1075    * must be of the same kind.
1076    */
1077   re:     cp()
1078           Ignore*
1079   factor: choice_or_seq_factor()
1080     {{
1081       let (finalmark,subexpr), rparen_entid = factor in
1082       if lparen_entid != rparen_entid then
1083         raise (Validation_error "Entities not properly nested with parentheses");
1084       (* Check that the other subexpressions are "regexp", too, and
1085        * merge them with the first.
1086        *)
1087       let re' =
1088         match subexpr with
1089             Alt []  ->  re
1090           | Alt alt -> Alt (re :: alt)
1091           | Seq seq -> Seq (re :: seq)
1092           | _       -> assert false
1093       in
1094       (* Interpret the finalmark. *)
1095       match finalmark with
1096           Ignore -> re'
1097         | Plus   -> Repeated1 re'
1098         | Star   -> Repeated re'
1099         | Qmark  -> Optional re'
1100         | _      -> assert false
1101     }}
1102   ? {{ raise(WF_error("Bad content model expression")) }}
1103
1104 choice_or_seq_factor():
1105   /* Parses "|<subexpr>|...)" or ",<subexpr>,...)", both forms optionally
1106    * followed by ?, *, or +.
1107    * Returns ((finalmark, expr), rparen_entid), where
1108    * - finalmark is the character after the right parenthesis or Ignore
1109    * - expr is either
1110    *   Alt []              meaning that only ")" has been found
1111    *   Alt non_empty_list  meaning that the subexpressions are separated by '|'
1112    *   Seq non_empty_list  meaning that the subexpressions are separated by ','
1113    */
1114   entid:Rparen
1115     {{ (Ignore, Alt []), entid }}
1116 | entid:RparenPlus
1117     {{ (Plus, Alt []), entid }}
1118 | entid:RparenStar
1119     {{ (Star, Alt []), entid }}
1120 | entid:RparenQmark
1121     {{ (Qmark, Alt []), entid }}
1122 | Bar Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
1123     {{
1124       let (finalmark, subexpr), rparen_entid = factor in
1125       begin match subexpr with
1126           Alt []  -> (finalmark, (Alt [re])), rparen_entid
1127         | Alt alt -> (finalmark, (Alt (re :: alt))), rparen_entid
1128         | _       -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
1129       end
1130     }}
1131   ? {{ raise(WF_error("Bad content model expression")) }}
1132 | Comma Ignore* re:cp() Ignore* factor:choice_or_seq_factor()
1133     {{
1134       let (finalmark, subexpr), rparen_entid = factor in
1135       begin match subexpr with
1136           Alt []  -> (finalmark, (Seq [re])), rparen_entid
1137         | Seq seq -> (finalmark, (Seq (re :: seq))), rparen_entid
1138         | _       -> raise(WF_error("It is not allowed to mix alternatives and sequences"))
1139       end
1140     }}
1141   ? {{ raise(WF_error("Bad content model expression")) }}
1142
1143 cp():
1144   /* parse either a name, or a parenthesized subexpression "(...)"  */
1145   name:Name  m:multiplier()?
1146     {{ match m with
1147           None       -> Child name
1148         | Some Plus  -> Repeated1 (Child name)
1149         | Some Star  -> Repeated  (Child name)
1150         | Some Qmark -> Optional  (Child name)
1151         | _          -> assert false
1152     }}
1153   ? {{ raise(WF_error("Bad content model expression")) }}
1154 | entid:Lparen Ignore* m:choice_or_seq(entid)
1155     {{ m }}
1156   ? {{ raise(WF_error("Bad content model expression")) }}
1157
1158
1159 /********************* ATTRIBUTE LIST DECLARATION ***********************/
1160
1161 attlistdecl():
1162   /* parses <!ATTLIST ... >. Enters the attribute list in dtd as side-
1163    * effect.
1164    */
1165   decl_attlist_entid: Decl_attlist
1166   $ {{ let extdecl = context.manager # current_entity_counts_as_external in
1167     }}
1168   ws1:                Ignore Ignore*
1169   el_name:            Name
1170   ws:                 Ignore? Ignore*
1171   factor:             attdef_factor()
1172     {{
1173       let at_list, decl_rangle_entid = factor in
1174
1175       if decl_attlist_entid != decl_rangle_entid then
1176         raise (Validation_error "Entities not properly nested with ATTLIST declaration");
1177
1178       if not ws && at_list <> [] then begin
1179         match at_list with
1180             (name,_,_) :: _ ->
1181               (* This is normally impossible, because the lexer demands
1182                * some other token between two names.
1183                *)
1184               raise(WF_error("Whitespace is missing before `" ^ name ^ "'"));
1185           | _ -> assert false
1186       end;
1187
1188       if extend_dtd then begin
1189         let new_el = new dtd_element dtd el_name in
1190         (* Note that it is allowed that <!ATTLIST...> precedes the corresponding
1191          * <!ELEMENT...> declaration. In this case we add the element declaration
1192          * already to the DTD but leave the content model unspecified.
1193          *)
1194         let el =
1195           try
1196             dtd # add_element new_el;
1197             new_el
1198           with
1199               Not_found ->  (* already added *)
1200                 let old_el = dtd # element el_name in
1201                 if old_el # attribute_names <>  [] then
1202                   config.warner # warn ("More than one ATTLIST declaration for element type `" ^
1203                                         el_name ^ "'");
1204                 old_el
1205         in
1206         List.iter
1207           (fun (a_name, a_type, a_default) ->
1208              el # add_attribute a_name a_type a_default extdecl)
1209           at_list
1210       end
1211     }}
1212   ? {{ match !yy_position with
1213            "ws1"     -> raise(WF_error("Whitespace is missing after ATTLIST"))
1214          | "el_name" -> raise(WF_error("The name of the element is expected here"))
1215          | "factor"  -> raise(WF_error("Another attribute name or `>' expected"))
1216          | _         -> raise(WF_error("Bad attribute declaration"))
1217     }}
1218
1219
1220 attdef_factor():
1221   /* parses a list of triples <name> <type> <default value> and returns the
1222    * list as (string * att_type * att_default) list.
1223    */
1224   attdef:attdef()   ws:Ignore?   Ignore*   factor:attdef_factor()
1225     {{
1226       let attdef_rest, decl_rangle_entid = factor in
1227       if not ws && attdef_rest <> [] then begin
1228         match attdef_rest with
1229             (name,_,_) :: _ ->
1230               raise(WF_error("Missing whitespace before `" ^ name ^ "'"));
1231           | _ -> assert false
1232       end;
1233       (attdef :: attdef_rest), decl_rangle_entid }}
1234   ? {{ match !yy_position with
1235          | "factor"  -> raise(WF_error("Another attribute name or `>' expected"))
1236          | _         -> raise(WF_error("Bad attribute declaration"))
1237     }}
1238 | entid:Decl_rangle
1239     {{ [], entid }}
1240
1241
1242 attdef():
1243   /* Parses a single triple */
1244   name:     Name
1245   ws1:      Ignore Ignore*
1246   tp:       atttype()
1247   ws2:      Ignore Ignore*
1248   default:  defaultdecl()
1249     {{ (name,tp,default) }}
1250   ? {{ match !yy_position with
1251            ("ws1"|"ws2") -> raise(WF_error("Whitespace is missing"))
1252          | "tp"          -> raise(WF_error("Type of attribute or `(' expected"))
1253          | "default"     -> raise(WF_error("#REQUIRED, #IMPLIED, #FIXED or a string literal expected"))
1254          | _             -> raise(WF_error("Bad attribute declaration"))
1255     }}
1256
1257 atttype():
1258   /* Parses an attribute type and returns it as att_type. */
1259   name:      Name
1260   $ {{ let followup =
1261          if name = "NOTATION" then
1262            parse_notation
1263          else
1264            parse_never
1265        in
1266      }}
1267   nota:      [followup]()?
1268     {{
1269        match name with
1270           "CDATA"    -> A_cdata
1271         | "ID"       -> A_id
1272         | "IDREF"    -> A_idref
1273         | "IDREFS"   -> A_idrefs
1274         | "ENTITY"   -> A_entity
1275         | "ENTITIES" -> A_entities
1276         | "NMTOKEN"  -> A_nmtoken
1277         | "NMTOKENS" -> A_nmtokens
1278         | "NOTATION" ->
1279             (match nota with
1280                  None   -> raise(WF_error("Error in NOTATION type (perhaps missing whitespace after NOTATION?)"))
1281                | Some n -> n
1282             )
1283         | _          -> raise(WF_error("One of CDATA, ID, IDREF, IDREFS, ENTITY, ENTITIES, NMTOKEN, NMTOKENS, NOTATION, or a subexpression expected"))
1284     }}
1285   ? {{ raise(WF_error("Bad attribute declaration (perhaps missing whitespace after NOTATION)")) }}
1286
1287 |         Lparen
1288           Ignore*
1289   name:   name_or_nametoken()
1290           Ignore*
1291   names:  nmtoken_factor()*
1292   rp:     Rparen
1293     /* Enumeration */
1294     {{ A_enum(name :: names) }}
1295   ? {{ match !yy_position with
1296            "name"  -> raise(WF_error("Name expected"))
1297          | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
1298          | "rp"    -> raise(WF_error("`|' and more names expected, or `)'"))
1299          | _       -> raise(WF_error("Bad enumeration type"))
1300     }}
1301
1302
1303 never():
1304   /* The always failing rule */
1305   $ {{ raise Not_found; }}
1306   Doctype   /* questionable */
1307     {{ A_cdata    (* Does not matter *)
1308     }}
1309
1310
1311 notation():
1312          Ignore Ignore*
1313   lp:    Lparen
1314          Ignore*
1315   name:  Name
1316          Ignore*
1317   names: notation_factor()*
1318   rp:    Rparen
1319     {{ A_notation(name :: names) }}
1320   ? {{ match !yy_position with
1321            "lp"    -> raise(WF_error("`(' expected"))
1322          | "name"  -> raise(WF_error("Name expected"))
1323          | "names" -> raise(WF_error("`|' and more names expected, or `)'"))
1324          | "rp"    -> raise(WF_error("`|' and more names expected, or `)'"))
1325          | _       -> raise(WF_error("Bad NOTATION type"))
1326     }}
1327
1328
1329 notation_factor():
1330   /* Parse "|<name>" and return the name */
1331   Bar Ignore* name:Name Ignore*
1332     {{ name }}
1333   ? {{ match !yy_position with
1334            "name" -> raise(WF_error("Name expected"))
1335          | _      -> raise(WF_error("Bad NOTATION type"))
1336     }}
1337
1338 nmtoken_factor():
1339   /* Parse "|<nmtoken>" and return the nmtoken */
1340   Bar Ignore* n:name_or_nametoken() Ignore*
1341     {{ n }}
1342   ? {{ match !yy_position with
1343            "n" -> raise(WF_error("Nametoken expected"))
1344          | _   -> raise(WF_error("Bad enumeration type"))
1345     }}
1346
1347
1348 name_or_nametoken():
1349   n:Name      {{ n }}
1350 | n:Nametoken {{ n }}
1351
1352
1353 /* The default values must be expanded and normalized. This has been implemented
1354  * by the function expand_attvalue.
1355  */
1356
1357
1358 defaultdecl():
1359   /* Parse the default value for an attribute and return it as att_default */
1360   Required
1361     {{ D_required }}
1362 | Implied
1363     {{ D_implied }}
1364 | Fixed ws:Ignore Ignore* str:Unparsed_string
1365     {{ D_fixed (expand_attvalue lexerset dtd str config.warner false) }}
1366   ? {{ match !yy_position with
1367            "ws"  -> raise(WF_error("Whitespace is missing after #FIXED"))
1368          | "str" -> raise(WF_error("String literal expected"))
1369          | _     -> raise(WF_error("Bad #FIXED default value"))
1370     }}
1371 | str:Unparsed_string
1372     {{ D_default (expand_attvalue lexerset dtd str config.warner false) }}
1373
1374
1375 /**************************** ENTITY DECLARATION ***********************/
1376
1377 entitydecl(decl_entity_entid):
1378   /* parses everything _after_ <!ENTITY until the matching >. The parsed
1379    * entity declaration is entered into the dtd object as side-effect.
1380    */
1381   name:               Name
1382   $ {{ let extdecl = context.manager # current_entity_counts_as_external in
1383     }}
1384   ws:                 Ignore Ignore*
1385   material:           entitydef()
1386                       Ignore*
1387   decl_rangle_entid:  Decl_rangle
1388     /* A general entity */
1389     {{
1390        if decl_entity_entid != decl_rangle_entid then
1391          raise (Validation_error "Entities not properly nested with ENTITY declaration");
1392       let en =
1393         (* Distinguish between
1394          * - internal entities
1395          * - external entities
1396          * - NDATA (unparsed) entities
1397          *)
1398         match material with
1399             (Some s, None,     None)   ->
1400               new internal_entity dtd name config.warner s p_internal_subset
1401                   config.errors_with_line_numbers false config.encoding
1402           | (None,   Some xid, None)   ->
1403               new external_entity (resolver # clone) dtd name config.warner
1404                                   xid false config.errors_with_line_numbers
1405                                   config.encoding
1406
1407           | (None,   Some xid, Some n) ->
1408               (new ndata_entity name xid n config.encoding :> entity)
1409           | _ -> assert false
1410       in
1411       dtd # add_gen_entity en extdecl
1412     }}
1413   ? {{ match !yy_position with
1414            "ws"                -> raise(WF_error("Whitespace is missing"))
1415          | "material"          -> raise(WF_error("String literal or identifier expected"))
1416          | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
1417          | _                   -> raise(WF_error("Bad entity declaration"))
1418     }}
1419
1420 |                     Percent
1421   $ {{ let extdecl = context.manager # current_entity_counts_as_external in
1422     }}
1423   ws1:                Ignore Ignore*
1424   name:               Name
1425   ws2:                Ignore Ignore*
1426   material:           pedef()
1427                       Ignore*
1428   decl_rangle_entid:  Decl_rangle
1429     /* A parameter entity */
1430     {{
1431       if decl_entity_entid != decl_rangle_entid then
1432          raise (Validation_error "Entities not properly nested with ENTITY declaration");
1433       let en =
1434         (* Distinguish between internal and external entities *)
1435         match material with
1436             (Some s, None)   ->
1437               new internal_entity dtd name config.warner s p_internal_subset
1438                   config.errors_with_line_numbers true config.encoding
1439           | (None,   Some xid)   ->
1440               new external_entity (resolver # clone) dtd name config.warner
1441                                   xid true config.errors_with_line_numbers
1442                                   config.encoding
1443           | _ -> assert false
1444       in
1445
1446       (* The following two lines force that even internal entities count
1447        * as external (for the standalone check) if the declaration of
1448        * the internal entity occurs in an external entity.
1449        *)
1450       if extdecl then
1451         en # set_counts_as_external;
1452
1453       dtd # add_par_entity en;
1454     }}
1455   ? {{ match !yy_position with
1456            ("ws1"|"ws2")       -> raise(WF_error("Whitespace is missing"))
1457          | "material"          -> raise(WF_error("String literal or identifier expected"))
1458          | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
1459          | _                   -> raise(WF_error("Bad entity declaration"))
1460     }}
1461
1462
1463 entitydef():
1464   /* parses the definition value of a general entity. Returns either:
1465    * - (Some s, None,   None)    meaning the definition of an internal entity
1466    *                               with (literal) value s has been found
1467    * - (None,   Some x, None)    meaning that an external parsed entity with
1468    *                               external ID x has been found
1469    * - (None,   Some x, Some n)  meaning that an unparsed entity with
1470    *                               external ID x and notations n has been found
1471    */
1472   str:Unparsed_string
1473     {{ Some str, None, None }}
1474 | id:external_id()   ws:Ignore?  Ignore*  decl:ndatadecl()?
1475     {{  if not ws  && decl <> None then
1476           raise(WF_error("Whitespace missing before `NDATA'"));
1477         None, Some id, decl
1478     }}
1479
1480
1481 pedef():
1482   /* parses the definition value of a parameter entity. Returns either:
1483    * - (Some s, None)     meaning that the definition of an internal entity
1484    *                        with (literal) value s has been found
1485    * - (None,   Some x)   meaning that an external ID x has been found
1486    */
1487   str:Unparsed_string
1488     {{ Some str, None }}
1489 | id:external_id()
1490     {{ None, Some id }}
1491
1492
1493 ndatadecl():
1494   /* Parses either NDATA "string" or the empty string; returns Some "string"
1495    * in the former, None in the latter case.
1496    */
1497   ndata:Name ws:Ignore Ignore* name:Name
1498     {{ if ndata = "NDATA" then
1499         name
1500       else
1501         raise(WF_error("NDATA expected"))
1502     }}
1503   ? {{ match !yy_position with
1504            "ws"   -> raise(WF_error("Whitespace is missing after NDATA"))
1505          | "name" -> raise(WF_error("Name expected"))
1506          | _      -> raise(WF_error("Bad NDATA declaration"))
1507     }}
1508
1509 /**************************** NOTATION DECLARATION *******************/
1510
1511 notationdecl():
1512   /* parses <!NOTATION ... > and enters the notation declaration into the
1513    * dtd object as side-effect
1514    */
1515   decl_notation_entid: Decl_notation
1516   ws1:                 Ignore Ignore*
1517   name:                Name
1518   ws2:                 Ignore Ignore*
1519   sys_or_public:       Name /* SYSTEM or PUBLIC */
1520   ws3:                 Ignore Ignore*
1521   str1:                Unparsed_string
1522   ws:                  Ignore? Ignore*
1523   str2:                Unparsed_string?
1524                        Ignore*
1525   decl_rangle_entid:   Decl_rangle
1526     {{
1527       if decl_notation_entid != decl_rangle_entid then
1528         raise (Validation_error "Entities not properly nested with NOTATION declaration");
1529       let xid =
1530         (* Note that it is allowed that PUBLIC is only followed by one
1531          * string literal
1532          *)
1533         match sys_or_public with
1534             "SYSTEM" ->
1535               if str2 <> None then raise(WF_error("SYSTEM must be followed only by one argument"));
1536               System (recode_utf8 str1)
1537           | "PUBLIC" ->
1538               begin match str2 with
1539                   None ->
1540                     check_public_id str1;
1541                     Public(recode_utf8 str1,"")
1542                 | Some p ->
1543                     if not ws then
1544                       raise(WF_error("Missing whitespace between the string literals of the `PUBLIC' id"));
1545                     check_public_id str1;
1546                     Public(recode_utf8 str1, recode_utf8 p)
1547               end
1548           | _ -> raise(WF_error("PUBLIC or SYSTEM expected"))
1549       in
1550       if extend_dtd then begin
1551         let no = new dtd_notation name xid config.encoding in
1552         dtd # add_notation no
1553       end
1554     }}
1555   ? {{ match !yy_position with
1556            ("ws1"|"ws2"|"ws3") -> raise(WF_error("Whitespace is missing"))
1557          | "name"              -> raise(WF_error("Name expected"))
1558          | "sys_or_public"     -> raise(WF_error("SYSTEM or PUBLIC expected"))
1559          | ("str1"|"str2")     -> raise(WF_error("String literal expected"))
1560          | "decl_rangle_entid" -> raise(WF_error("`>' expected"))
1561          | _                   -> raise(WF_error("Bad NOTATION declaration"))
1562     }}
1563
1564 /****************************** ELEMENTS **************************/
1565
1566 /* In the following rules, the number of error rules is reduced to
1567  * improve the performance of the parser.
1568  */
1569
1570
1571 contents_start():
1572   /* parses <element>...</element> misc*, i.e. exactly one element followed
1573    * optionally by white space or processing instructions.
1574    * The element is entered into the global variables as follows:
1575    * - If elstack is non-empty, the parsed element is added as new child to
1576    *   the top element of the stack.
1577    * - If elstack is empty, the root_examplar object is modified rather than
1578    *   that a new element is created. If additionally the variable root is
1579    *   None, it is assigned Some root_examplar.
1580    * Note that the modification of the root_exemplar is done by the method
1581    * internal_init.
1582    * The reason why the root element is modified rather than newly created
1583    * is a typing requirement. It must be possible that the class of the root
1584    * is derived from the original class element_impl, i.e. the user must be
1585    * able to add additional methods. If we created a new root object, we
1586    * would have to denote to which class the new object belongs; the root
1587    * would always be an 'element_impl' object (and not a derived object).
1588    * If we instead cloned an  exemplar object and modified it by the
1589    * "create" method, the root object would belong to the same class as the
1590    * exemplar (good), but the type of the parsing function would always
1591    * state that an 'element_impl' was created (because we can pass the new
1592    * object only back via a global variable). The only solution is to
1593    * modify the object that has been passed to the parsing function directly.
1594    */
1595   $ {{ dtd <- transform_dtd dtd; }}
1596   start_tag() content()*
1597     {{ () }}
1598
1599
1600 content():
1601   /* parses: start tags, end tags, content, or processing
1602    * instructions. That the tags are properly nested is dynamically checked.
1603    * As result, recognized elements are added to their parent elements,
1604    * content is added to the element containing it, and processing instructions
1605    * are entered into the element embracing them. (All as side-effects.)
1606    */
1607   start_tag()
1608     {{ () }}
1609 | end_tag()
1610     {{ () }}
1611 | char_data()
1612     {{ () }}
1613 | cref()
1614     {{ () }}
1615 | pi()
1616     {{ () }}
1617 | entity_ref()
1618     {{ () }}
1619 | comment()
1620     {{ () }}
1621
1622
1623 entity_ref():
1624    Begin_entity eref_xmldecl_then_rest()
1625     {{ if n_tags_open = 0 then
1626         raise(WF_error("Entity reference not allowed here"))
1627     }}
1628
1629
1630 /* See comment for doc_mldecl_then_misc_then_prolog_then_rest. */
1631
1632 eref_xmldecl_then_rest():
1633   pl:PI_xml
1634   $ {{ context.manager # current_entity # process_xmldecl pl;
1635     }}
1636   content()* End_entity
1637     {{ () }}
1638
1639 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
1640   content() content()* End_entity
1641     {{ () }}
1642
1643 | $ {{ context.manager # current_entity # process_missing_xmldecl; }}
1644   End_entity
1645     {{ () }}
1646
1647
1648 start_tag():
1649   /* parses <element attribute-values> or <element attribute-values/>.
1650    *
1651    * EFFECT: If elstack is non-empty, the element is added to the
1652    * top element of the stack as new child, and the element
1653    * is pushed on the stack. If elstack is empty, the root_exemplar is
1654    * modified and gets the parsed name and attribute list. The root_exemplar
1655    * is pushed on the stack. If additionally the variable root is empty, too,
1656    * this variable is initialized.
1657    * If the <element ... /> form has been parsed, no element is pushed
1658    * on the stack.
1659    */
1660   tag:        Tag_beg
1661     $ {{ let position =
1662            if config.store_element_positions then
1663              Some(context.manager # position)
1664            else
1665              None
1666          in
1667        }}
1668   ws:         Ignore? Ignore*
1669   attlist:    attribute()*
1670   emptiness:  start_tag_rangle()
1671   /* Note: it is guaranteed that there is whitespace between Tag_beg and
1672    * the name of the first attribute, because there must be some separator.
1673    * So we need not to check ws!
1674    */
1675     {{
1676       let rec check_attlist al =
1677         match al with
1678             (nv1, num1) :: al' ->
1679               if not num1 && al' <> [] then begin
1680                 match al with
1681                     ((n1,_),_) :: ((n2,_),_) :: _ ->
1682                       raise(WF_error("Whitespace is missing between attributes `" ^
1683                                      n1 ^ "' and `" ^ n2 ^ "'"))
1684                   | _ -> assert false
1685               end;
1686               check_attlist al'
1687           | [] -> ()
1688       in
1689       check_attlist attlist;
1690
1691       let name, tag_beg_entid = tag in
1692       let attlist' = List.map (fun (nv,_) -> nv) attlist in
1693       let d =
1694         create_element_node ?position:position spec dtd name attlist' in
1695
1696       begin match id_index with
1697           None -> ()
1698         | Some idx ->
1699             (* Put the ID attribute into the index, if present *)
1700             begin try
1701               let v = d # id_attribute_value in  (* may raise Not_found *)
1702               idx # add v d                      (* may raise ID_not_unique *)
1703             with
1704                 Not_found ->
1705                   (* No ID attribute *)
1706                   ()
1707               | ID_not_unique ->
1708                   (* There is already an ID with the same value *)
1709                   raise(Validation_error("ID not unique"))
1710             end
1711       end;
1712
1713       if n_tags_open = 0 then begin
1714         if root = None then begin
1715           (* We have found the begin tag of the root element. *)
1716           if config.enable_super_root_node then begin
1717             (* The user wants the super root instead of the real root.
1718              * The real root element becomes the child of the VR.
1719              *)
1720             (* Assertion: self # current is the super root *)
1721             assert (self # current # node_type = T_super_root);
1722             root <- Some (self # current);
1723             self # current # add_node d;
1724             doc # init_root (self # current);
1725           end
1726           else begin
1727             (* Normal behaviour: The user wants to get the real root. *)
1728             root <- Some d;
1729             doc # init_root d;
1730           end;
1731         end
1732         else
1733           (* We have found a second topmost element. This is illegal. *)
1734           raise(WF_error("Document must consist of only one toplevel element"))
1735       end
1736       else begin
1737         (* We have found some inner begin tag. *)
1738         self # save_data;        (* Save outstanding data material first *)
1739         self # current # add_node d
1740       end;
1741
1742       if emptiness then
1743         (* An empty tag like <a/>. *)
1744         d # local_validate ~use_dfa:config.validate_by_dfa ()
1745       else begin
1746         (* A non-empty tag. *)
1747         Stack.push (d, tag_beg_entid) elstack;
1748         n_tags_open <- n_tags_open + 1;
1749       end;
1750     }}
1751   ? {{ match !yy_position with
1752            "attlist"   -> raise(WF_error("Bad attribute list"))
1753          | "emptiness" -> raise(WF_error("`>' or `/>' expected"))
1754          | _           -> raise(WF_error("Bad start tag"))
1755     }}
1756
1757
1758 attribute():
1759   /* Parses name="value"  */
1760   n:Name Ignore* Eq Ignore* v:attval() ws:Ignore? Ignore*
1761     {{ (n,v), ws }}
1762
1763
1764 attval():
1765   v:Attval
1766     {{ expand_attvalue lexerset dtd v config.warner true }}
1767 | v:Attval_nl_normalized
1768     {{ expand_attvalue lexerset dtd v config.warner false }}
1769
1770
1771 start_tag_rangle():
1772   Rangle       {{ false }}
1773 | Rangle_empty {{ true }}
1774
1775
1776 end_tag():
1777   /* parses </element>.
1778    * Pops the top element from the elstack and checks if it is the same
1779    * element.
1780    */
1781   tag:Tag_end  Ignore*  Rangle
1782     {{ let name, tag_end_entid = tag in
1783        if n_tags_open = 0 then
1784          raise(WF_error("End-tag without start-tag"));
1785
1786        self # save_data;        (* Save outstanding data material first *)
1787
1788        let x, tag_beg_entid = Stack.pop elstack in
1789        let x_name =
1790          match x # node_type with
1791            | T_element n -> n
1792            | _ -> assert false
1793        in
1794        if name <> x_name then
1795          raise(WF_error("End-tag does not match start-tag"));
1796        if tag_beg_entid != tag_end_entid then
1797          raise(WF_error("End-tag not in the same entity as the start-tag"));
1798        x # local_validate ~use_dfa:config.validate_by_dfa ();
1799
1800        n_tags_open <- n_tags_open - 1;
1801
1802        assert (n_tags_open >= 0);
1803
1804     }}
1805
1806 char_data():
1807   /* Parses any literal characters not otherwise matching, and adds the
1808    * characters to the top element of elstack.
1809    * If elstack is empty, it is assumed that there is no surrounding
1810    * element, and any non-white space character is forbidden.
1811    */
1812   data:CharData
1813     {{
1814       if n_tags_open = 0 then
1815         (* only white space is allowed *)
1816         self # only_whitespace data
1817       else
1818         self # collect_data data
1819           (* We collect the chardata material until the next end tag is
1820            * reached. Then the collected material will concatenated and
1821            * stored as a single T_data node (see end_tag rule above)
1822            * using save_data.
1823            *)
1824     }}
1825 | data:Cdata
1826     {{
1827       if n_tags_open = 0 then
1828         raise (WF_error("CDATA section not allowed here"));
1829       self # collect_data data
1830           (* Also collect CDATA material *)
1831     }}
1832
1833 cref():
1834   /* Parses &#...; and adds the character to the top element of elstack. */
1835   code:CRef
1836     {{
1837        if n_tags_open = 0 then
1838          (* No surrounding element: character references are not allowed *)
1839          raise(WF_error("Character reference not allowed here"));
1840        self # collect_data (character config.encoding config.warner code)
1841           (* Also collect character references *)
1842     }}
1843
1844 pi():
1845   /* Parses <?...?> (but not <?xml white-space ... ?>).
1846    * If there is a top element in elstack, the processing instruction is added
1847    * to this element.
1848    */
1849   pi: PI
1850     {{
1851       let position =
1852         if config.store_element_positions then
1853           Some(context.manager # position)
1854         else
1855           None
1856       in
1857       let target,value = pi in
1858
1859       if n_tags_open = 0 & not config.enable_super_root_node
1860       then
1861         doc # add_pinstr (new proc_instruction target value config.encoding)
1862       else begin
1863         (* Special case: if processing instructions are processed inline,
1864          * they are wrapped into T_pinstr nodes.
1865          *)
1866         if config.enable_pinstr_nodes then begin
1867           self # save_data;        (* Save outstanding data material first *)
1868           let pinstr = new proc_instruction target value config.encoding in
1869           let wrapper = create_pinstr_node
1870                           ?position:position spec dtd pinstr in
1871           wrapper # local_validate();                (* succeeds always   *)
1872           self # current # add_node wrapper;
1873         end
1874         else
1875           (* Normal behaviour: Add the PI to the parent element. *)
1876           self # current # add_pinstr
1877                              (new proc_instruction target value config.encoding)
1878       end
1879     }}
1880
1881
1882 comment():
1883   /* Parses <!-- ... -->
1884    */
1885   Comment_begin
1886   $ {{
1887       let position =
1888         if config.enable_comment_nodes && config.store_element_positions then
1889           Some(context.manager # position)
1890         else
1891           None
1892       in
1893     }}
1894   mat: Comment_material*
1895   ce: Comment_end
1896     {{
1897       if config.enable_comment_nodes then begin
1898         self # save_data;        (* Save outstanding data material first *)
1899         let comment_text = String.concat "" mat in
1900         let wrapper = create_comment_node
1901                         ?position:position spec dtd comment_text in
1902         wrapper # local_validate();                (* succeeds always   *)
1903         self # current # add_node wrapper;
1904       end
1905     }}
1906   ? {{ match !yy_position with
1907          | "ce"  -> raise(WF_error("`-->' expected"))
1908          | _     -> raise(WF_error("Bad comment"))
1909     }}
1910
1911
1912 %%
1913    (* The method "parse" continues here... *)
1914
1915    try
1916      match start_symbol with
1917          Ext_document ->
1918            parse_ext_document context.current context.get_next
1919        | Ext_declarations ->
1920            parse_ext_declarations context.current context.get_next
1921        | Ext_element ->
1922            parse_ext_element context.current context.get_next
1923    with
1924        Not_found ->
1925          raise Parsing.Parse_error
1926
1927   (*********** The method "parse" ends here *************)
1928
1929
1930 (**********************************************************************)
1931
1932 (* Here ends the class definition: *)
1933 end
1934 ;;
1935
1936 (**********************************************************************)
1937
1938 open Pxp_reader;;
1939
1940
1941 class default_ext =
1942   object(self)
1943     val mutable node = (None : ('a extension node as 'a) option)
1944     method clone = {< >}
1945     method node =
1946       match node with
1947           None ->
1948             assert false
1949         | Some n -> n
1950     method set_node n =
1951       node <- Some n
1952   end
1953 ;;
1954
1955
1956 let default_extension = new default_ext;;
1957
1958 let default_spec =
1959   make_spec_from_mapping
1960     ~super_root_exemplar:      (new element_impl default_extension)
1961     ~comment_exemplar:         (new element_impl default_extension)
1962     ~default_pinstr_exemplar:  (new element_impl default_extension)
1963     ~data_exemplar:            (new data_impl default_extension)
1964     ~default_element_exemplar: (new element_impl default_extension)
1965     ~element_mapping:          (Hashtbl.create 1)
1966     ()
1967 ;;
1968
1969
1970 let idref_pass id_index root =
1971   let error t att value =
1972     let name =
1973       match t # node_type with
1974           T_element name -> name
1975         | _ -> assert false
1976     in
1977     let text =
1978       "Attribute `" ^ att ^ "' of element `" ^ name ^
1979       "' refers to unknown ID `" ^ value ^ "'" in
1980     let pos_ent, pos_line, pos_col = t # position in
1981     if pos_line = 0 then
1982       raise(Validation_error text)
1983     else
1984       raise(At("In entity " ^ pos_ent ^ " at line " ^
1985                string_of_int pos_line ^ ", position " ^ string_of_int pos_col ^
1986                ":\n",
1987                Validation_error text))
1988   in
1989
1990   let rec check_tree t =
1991     let idref_atts = t # idref_attribute_names in
1992     List.iter
1993       (fun att ->
1994          match t # attribute att with
1995              Value s ->
1996                begin try ignore(id_index # find s) with
1997                    Not_found ->
1998                      error t att s
1999                end
2000            | Valuelist l ->
2001                List.iter
2002                  (fun s ->
2003                     try ignore(id_index # find s) with
2004                         Not_found ->
2005                           error t att s
2006                  )
2007                  l
2008            | Implied_value -> ()
2009       )
2010       idref_atts;
2011     List.iter check_tree (t # sub_nodes)
2012   in
2013   check_tree root
2014 ;;
2015
2016
2017 exception Return_DTD of dtd;;
2018   (* Used by extract_dtd_from_document_entity to jump out of the parser *)
2019
2020
2021 let call_parser ~configuration:cfg
2022                 ~source:src
2023                 ~dtd
2024                 ~extensible_dtd
2025                 ~document:doc
2026                 ~specification:spec
2027                 ~process_xmldecl
2028                 ~transform_dtd
2029                 ~(id_index : 'ext #index option)
2030                 ~use_document_entity
2031                 ~entry
2032                 ~init_lexer =
2033   let e = cfg.errors_with_line_numbers in
2034   let w = cfg.warner in
2035   let r, en =
2036     match src with
2037         Entity(m,r')  -> r', m dtd
2038       | ExtID(xid,r') -> r',
2039                          if use_document_entity then
2040                            new document_entity
2041                              r' dtd "[toplevel]" w xid e
2042                              cfg.encoding
2043                          else
2044                            new external_entity
2045                              r' dtd "[toplevel]" w xid false e
2046                              cfg.encoding
2047   in
2048   r # init_rep_encoding cfg.encoding;
2049   r # init_warner w;
2050   en # set_debugging_mode (cfg.debugging_mode);
2051   let pobj =
2052     new parser_object
2053       doc
2054       dtd
2055       extensible_dtd
2056       cfg
2057       r
2058       spec
2059       process_xmldecl
2060       transform_dtd
2061       (id_index :> 'ext index option)
2062   in
2063   let mgr = new entity_manager en in
2064   en # open_entity true init_lexer;
2065   begin try
2066     let context = make_context mgr in
2067     pobj # parse context entry;
2068     ignore(en # close_entity);
2069   with
2070       Return_DTD d ->
2071         ignore(en # close_entity);
2072         raise(Return_DTD d)
2073     | error ->
2074         ignore(en # close_entity);
2075         r # close_all;
2076         let pos = mgr # position_string in
2077         raise (At(pos, error))
2078   end;
2079   if cfg.idref_pass then begin
2080     match id_index with
2081         None -> ()
2082       | Some idx ->
2083           ( match pobj # root with
2084                 None -> ()
2085               | Some root ->
2086                   idref_pass idx root;
2087           )
2088   end;
2089   pobj
2090
2091
2092 let parse_dtd_entity cfg src =
2093   (* Parse a DTD given as separate entity. *)
2094   let dtd = new dtd cfg.warner cfg.encoding in
2095   let doc = new document cfg.warner in
2096   let pobj =
2097     call_parser
2098       ~configuration:cfg
2099       ~source:src
2100       ~dtd:dtd
2101       ~extensible_dtd:true         (* Extend the DTD by parsed declarations *)
2102       ~document:doc
2103       ~specification:default_spec
2104       ~process_xmldecl:false       (* The XML declaration is ignored
2105                                     * (except 'encoding')
2106                                     *)
2107       ~transform_dtd:(fun x -> x)  (* Do not transform the DTD *)
2108       ~id_index: None
2109       ~use_document_entity:false
2110       ~entry:Ext_declarations      (* Entry point of the grammar *)
2111       ~init_lexer:Declaration      (* The initially used lexer *)
2112   in
2113   dtd # validate;
2114   if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
2115   dtd
2116 ;;
2117
2118
2119 let parse_content_entity ?id_index cfg src dtd spec =
2120   (* Parse an element given as separate entity *)
2121   dtd # validate;            (* ensure that the DTD is valid *)
2122   if cfg.accept_only_deterministic_models then dtd # only_deterministic_models;
2123   let doc = new document cfg.warner in
2124   let pobj =
2125     call_parser
2126       ~configuration:cfg
2127       ~source:src
2128       ~dtd:dtd
2129       ~extensible_dtd:true         (* Extend the DTD by parsed declarations *)
2130       ~document:doc
2131       ~specification:spec
2132       ~process_xmldecl:false       (* The XML declaration is ignored
2133                                     * (except 'encoding')
2134                                     *)
2135       ~transform_dtd:(fun x -> x)  (* Do not transform the DTD *)
2136       ~id_index:(id_index :> 'ext index option)
2137       ~use_document_entity:false
2138       ~entry:Ext_element           (* Entry point of the grammar *)
2139       ~init_lexer:Content          (* The initially used lexer *)
2140   in
2141   match pobj # root with
2142       Some r -> r
2143     | None -> raise(WF_error("No root element"))
2144 ;;
2145
2146
2147 let parse_wfcontent_entity cfg src spec =
2148   let dtd = new dtd cfg.warner cfg.encoding in
2149   dtd # allow_arbitrary;
2150   let doc = new document cfg.warner in
2151   let pobj =
2152     call_parser
2153       ~configuration:cfg
2154       ~source:src
2155       ~dtd:dtd
2156       ~extensible_dtd:false        (* Do not extend the DTD *)
2157       ~document:doc
2158       ~specification:spec
2159       ~process_xmldecl:false       (* The XML declaration is ignored
2160                                     * (except 'encoding')
2161                                     *)
2162       ~transform_dtd:(fun x -> x)  (* Do not transform the DTD *)
2163       ~id_index:None
2164       ~use_document_entity:false
2165       ~entry:Ext_element           (* Entry point of the grammar *)
2166       ~init_lexer:Content          (* The initially used lexer *)
2167   in
2168   match pobj # root with
2169       Some r -> r
2170     | None -> raise(WF_error("No root element"))
2171 ;;
2172
2173
2174 let iparse_document_entity ?(transform_dtd = (fun x -> x))
2175                            ?id_index
2176                            cfg0 src spec p_wf =
2177   (* Parse an element given as separate entity *)
2178   (* p_wf: 'true' if in well-formedness mode, 'false' if in validating mode *)
2179   let cfg = { cfg0 with
2180                 recognize_standalone_declaration =
2181                    cfg0.recognize_standalone_declaration && (not p_wf)
2182             } in
2183   let dtd = new dtd cfg.warner cfg.encoding in
2184   if p_wf then
2185     dtd # allow_arbitrary;
2186   let doc = new document cfg.warner in
2187   let pobj =
2188     call_parser
2189       ~configuration:cfg
2190       ~source:src
2191       ~dtd:dtd
2192       ~extensible_dtd:(not p_wf)   (* Extend the DTD by parsed declarations
2193                                     * only if in validating mode
2194                                     *)
2195       ~document:doc
2196       ~specification:spec
2197       ~process_xmldecl:true        (* The XML declaration is processed *)
2198                                    (* TODO: change to 'not p_wf' ? *)
2199       ~transform_dtd:(fun dtd ->
2200                         let dtd' = transform_dtd dtd in
2201                         if cfg.accept_only_deterministic_models then
2202                           dtd' # only_deterministic_models;
2203                         dtd')
2204
2205       ~id_index:(id_index :> 'ext index option)
2206       ~use_document_entity:true
2207       ~entry:Ext_document          (* Entry point of the grammar *)
2208       ~init_lexer:Document         (* The initially used lexer *)
2209   in
2210   pobj # doc
2211 ;;
2212
2213
2214 let parse_document_entity ?(transform_dtd = (fun x -> x))
2215                           ?id_index
2216                           cfg src spec =
2217   iparse_document_entity
2218     ~transform_dtd:transform_dtd
2219     ?id_index:(id_index : 'ext #index option :> 'ext index option)
2220     cfg src spec false;;
2221
2222 let parse_wfdocument_entity cfg src spec =
2223   iparse_document_entity cfg src spec true;;
2224
2225 let extract_dtd_from_document_entity cfg src =
2226   let transform_dtd dtd = raise (Return_DTD dtd) in
2227   try
2228     let doc = parse_document_entity
2229                 ~transform_dtd:transform_dtd
2230                 cfg
2231                 src
2232                 default_spec in
2233     (* Should not happen: *)
2234     doc # dtd
2235   with
2236       Return_DTD dtd ->
2237         (* The normal case: *)
2238         dtd
2239 ;;
2240
2241
2242 let default_config =
2243   let w = new drop_warnings in
2244   { warner = w;
2245     errors_with_line_numbers = true;
2246     enable_pinstr_nodes = false;
2247     enable_super_root_node = false;
2248     enable_comment_nodes = false;
2249     encoding = `Enc_iso88591;
2250     recognize_standalone_declaration = true;
2251     store_element_positions = true;
2252     idref_pass = false;
2253     validate_by_dfa = true;
2254     accept_only_deterministic_models = true;
2255     debugging_mode = false;
2256   }
2257
2258
2259 class  [ 'ext ] hash_index =
2260 object
2261   constraint 'ext = 'ext node #extension
2262   val ht = (Hashtbl.create 100 : (string, 'ext node) Hashtbl.t)
2263   method add s n =
2264     try
2265       ignore(Hashtbl.find ht s);
2266       raise ID_not_unique
2267     with
2268         Not_found ->
2269           Hashtbl.add ht s n
2270
2271   method find s = Hashtbl.find ht s
2272   method index = ht
2273 end
2274
2275
2276 (* ======================================================================
2277  * History:
2278  *
2279  * $Log$
2280  * Revision 1.1  2000/11/17 09:57:29  lpadovan
2281  * Initial revision
2282  *
2283  * Revision 1.14  2000/08/26 23:23:14  gerd
2284  *      Bug: from_file must not interpret the file name as URL path.
2285  *      Bug: When PI and comment nodes are generated, the collected data
2286  * material must be saved first.
2287  *
2288  * Revision 1.13  2000/08/19 21:30:03  gerd
2289  *      Improved the error messages of the parser
2290  *
2291  * Revision 1.12  2000/08/18 20:16:25  gerd
2292  *      Implemented that Super root nodes, pinstr nodes and comment
2293  * nodes are included into the document tree.
2294  *
2295  * Revision 1.11  2000/08/14 22:24:55  gerd
2296  *      Moved the module Pxp_encoding to the netstring package under
2297  * the new name Netconversion.
2298  *
2299  * Revision 1.10  2000/07/23 02:16:33  gerd
2300  *      Support for DFAs.
2301  *
2302  * Revision 1.9  2000/07/14 13:57:29  gerd
2303  *      Added the id_index feature.
2304  *
2305  * Revision 1.8  2000/07/09 17:52:45  gerd
2306  *      New implementation for current_data.
2307  *      The position of elements is stored on demand.
2308  *
2309  * Revision 1.7  2000/07/09 01:00:35  gerd
2310  *      Improvement: It is now guaranteed that only one data node
2311  * is added for consecutive character material.
2312  *
2313  * Revision 1.6  2000/07/08 16:27:29  gerd
2314  *      Cleaned up the functions calling the parser.
2315  *      New parser argument: transform_dtd.
2316  *      Implementations for 'extract_dtd_from_document_entity' and
2317  * 'parse_wfcontent_entity'.
2318  *
2319  * Revision 1.5  2000/07/06 23:05:18  gerd
2320  *      Initializations of resolvers were missing.
2321  *
2322  * Revision 1.4  2000/07/06 22:11:01  gerd
2323  *      Fix: The creation of the non-virtual root element is protected
2324  * in the same way as the virtual root element.
2325  *
2326  * Revision 1.3  2000/07/04 22:15:18  gerd
2327  *      Change: Using the new resolver capabilities.
2328  *      Still incomplete: the new extraction and parsing functions.
2329  *
2330  * Revision 1.2  2000/06/14 22:19:06  gerd
2331  *      Added checks such that it is impossible to mix encodings.
2332  *
2333  * Revision 1.1  2000/05/29 23:48:38  gerd
2334  *      Changed module names:
2335  *              Markup_aux          into Pxp_aux
2336  *              Markup_codewriter   into Pxp_codewriter
2337  *              Markup_document     into Pxp_document
2338  *              Markup_dtd          into Pxp_dtd
2339  *              Markup_entity       into Pxp_entity
2340  *              Markup_lexer_types  into Pxp_lexer_types
2341  *              Markup_reader       into Pxp_reader
2342  *              Markup_types        into Pxp_types
2343  *              Markup_yacc         into Pxp_yacc
2344  * See directory "compatibility" for (almost) compatible wrappers emulating
2345  * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
2346  *
2347  * ======================================================================
2348  * Old logs from markup_yacc.m2y:
2349  *
2350  * Revision 1.9  2000/05/29 21:14:57  gerd
2351  *      Changed the type 'encoding' into a polymorphic variant.
2352  *
2353  * Revision 1.8  2000/05/27 19:26:19  gerd
2354  *      Change: The XML declaration is interpreted right after
2355  * it has been parsed (no longer after the document): new function
2356  * check_and_parse_xmldecl.
2357  *      When elements, attributes, and entities are declared
2358  * it is stored whether the declaration happens in an external
2359  * entity (for the standalone check).
2360  *      The option recognize_standalone_declaration is interpreted.
2361  *
2362  * Revision 1.7  2000/05/20 20:31:40  gerd
2363  *      Big change: Added support for various encodings of the
2364  * internal representation.
2365  *
2366  * Revision 1.6  2000/05/14 21:51:24  gerd
2367  *      Change: Whitespace is handled by the grammar, and no longer
2368  * by the entity.
2369  *
2370  * Revision 1.5  2000/05/14 17:50:54  gerd
2371  *      Updates because of changes in the token type.
2372  *
2373  * Revision 1.4  2000/05/11 22:09:17  gerd
2374  *      Fixed the remaining problems with conditional sections.
2375  * This seems to be also a weakness of the XML spec!
2376  *
2377  * Revision 1.3  2000/05/09 00:02:44  gerd
2378  *      Conditional sections are now recognized by the parser.
2379  * There seem some open questions; see the TODO comments!
2380  *
2381  * Revision 1.2  2000/05/08 22:01:44  gerd
2382  *      Introduced entity managers (see markup_entity.ml).
2383  *      The XML declaration is now recognized by the parser. If such
2384  * a declaration is found, the method process_xmldecl of the currently
2385  * active entity is called. If the first token is not an XML declaration,
2386  * the method process_missing_xmldecl is called instead.
2387  *      Some minor changes.
2388  *
2389  * Revision 1.1  2000/05/06 23:21:49  gerd
2390  *      Initial revision.
2391  *
2392  *
2393  * ======================================================================
2394  *
2395  * COPIED FROM REVISION 1.19 OF markup_yacc.mly
2396  *
2397  * Revision 1.19  2000/05/01 15:20:08  gerd
2398  *      "End tag matches start tag" is checked before "End tag in the
2399  * same entity as start tag".
2400  *
2401  * Revision 1.18  2000/04/30 18:23:08  gerd
2402  *      Bigger change: Introduced the concept of virtual roots. First,
2403  * this reduces the number of checks. Second, it makes it possible to
2404  * return the virtual root to the caller instead of the real root (new
2405  * config options 'virtual_root' and 'processing_instructions_inline').
2406  *      Minor changes because of better CR/CRLF handling.
2407  *
2408  * Revision 1.17  2000/03/13 23:47:46  gerd
2409  *      Updated because of interface changes. (See markup_yacc_shadow.mli
2410  * rev. 1.8)
2411  *
2412  * Revision 1.16  2000/01/20 20:54:43  gerd
2413  *      New config.errors_with_line_numbers.
2414  *
2415  * Revision 1.15  1999/12/17 22:27:58  gerd
2416  *      Bugfix: The value of 'p_internal_subset' (an instance
2417  * variable of the parser object) is to true when the internal subset
2418  * begins, and is set to false when this subset ends. The error was
2419  * that references to external entities within this subset did not
2420  * set 'p_internal_subset' to false; this is now corrected by introducing
2421  * the 'p_internal_subset_stack'.
2422  *      This is a typical example of how the code gets more and
2423  * more complicated and that it is very difficult to really understand
2424  * what is going on.
2425  *
2426  * Revision 1.14  1999/11/09 22:23:37  gerd
2427  *      Removed the invocation of "init_dtd" of the root document.
2428  * This method is no longer available. The DTD is also passed to the
2429  * document object by the root element, so nothing essential changes.
2430  *
2431  * Revision 1.13  1999/10/25 23:37:09  gerd
2432  *      Bugfix: The warning "More than one ATTLIST declaration for element
2433  * type ..." is only generated if an ATTLIST is found while there are already
2434  * attributes for the element.
2435  *
2436  * Revision 1.12  1999/09/01 23:08:38  gerd
2437  *      New frontend function: parse_wf_document. This simply uses
2438  * a DTD that allows anything, and by the new parameter "extend_dtd" it is
2439  * avoided that element, attlist, and notation declarations are added to this
2440  * DTD. The idea is that this function simulates a well-formedness parser.
2441  *      Tag_beg, Tag_end carry the entity_id. The "elstack" stores the
2442  * entity_id of the stacked tag. This was necessary because otherwise there
2443  * are some examples to produces incorrectly nested elements.
2444  *      p_internal_subset is a variable that stores whether the internal
2445  * subset is being parsed. This is important beacause entity declarations in
2446  * internal subsets are not allowed to contain parameter references.
2447  *      It is checked if the "elstack" is empty after all has been parsed.
2448  *      Processing instructions outside DTDs and outside elements are now
2449  * added to the document.
2450  *      The rules of mixed and regexp style content models have been
2451  * separated. The code is now much simpler.
2452  *      Entity references outside elements are detected and rejected.
2453  *
2454  * Revision 1.11  1999/09/01 16:26:08  gerd
2455  *      Improved the quality of error messages.
2456  *
2457  * Revision 1.10  1999/08/31 19:13:31  gerd
2458  *      Added checks on proper PE nesting. The idea is that tokens such
2459  * as Decl_element and Decl_rangle carry an entity ID with them. This ID
2460  * is simply an object of type < >, i.e. you can only test on identity.
2461  * The lexer always produces tokens with a dummy ID because it does not
2462  * know which entity is the current one. The entity layer replaces the dummy
2463  * ID with the actual ID. The parser checks that the IDs of pairs such as
2464  * Decl_element and Decl_rangle are the same; otherwise a Validation_error
2465  * is produced.
2466  *
2467  * Revision 1.9  1999/08/15 20:42:01  gerd
2468  *      Corrected a misleading message.
2469  *
2470  * Revision 1.8  1999/08/15 20:37:34  gerd
2471  *      Improved error messages.
2472  *      Bugfix: While parsing document entities, the subclass document_entity is
2473  * now used instead of external_entity. The rules in document entities are a bit
2474  * stronger.
2475  *
2476  * Revision 1.7  1999/08/15 14:03:59  gerd
2477  *      Empty documents are not allowed.
2478  *      "CDATA section not allowed here" is a WF_error, not a Validation_
2479  * error.
2480  *
2481  * Revision 1.6  1999/08/15 02:24:19  gerd
2482  *      Removed some grammar rules that were used for testing.
2483  *      Documents without DTD can now have arbitrary elements (formerly
2484  * they were not allowed to have any element).
2485  *
2486  * Revision 1.5  1999/08/14 22:57:20  gerd
2487  *      It is allowed that external entities are empty because the
2488  * empty string is well-parsed for both declarations and contents. Empty
2489  * entities can be referenced anywhere because the references are replaced
2490  * by nothing. Because of this, the Begin_entity...End_entity brace is only
2491  * inserted if the entity is non-empty. (Otherwise references to empty
2492  * entities would not be allowed anywhere.)
2493  *      As a consequence, the grammar has been changed such that a
2494  * single Eof is equivalent to Begin_entity,End_entity without content.
2495  *
2496  * Revision 1.4  1999/08/14 22:20:01  gerd
2497  *         The "config" slot has now a component "warner" which is
2498  * an object with a "warn" method. This is used to warn about characters
2499  * that cannot be represented in the Latin 1 alphabet.
2500  *         Furthermore, there is a new component "debugging_mode".
2501  *         Some Parse_error exceptions have been changed into Validation_error.
2502  *         The interfaces of functions/classes imported from other modules
2503  * have changed; the invocations have been adapted.
2504  *         Contents may contain CDATA sections that have been forgotten.
2505  *
2506  * Revision 1.3  1999/08/11 15:00:41  gerd
2507  *      The Begin_entity ... End_entity brace is also possible in
2508  * 'contents'.
2509  *      The configuration passed to the parsing object contains always
2510  * the resolver that is actually used.
2511  *
2512  * Revision 1.2  1999/08/10 21:35:12  gerd
2513  *      The XML/encoding declaration at the beginning of entities is
2514  * evaluated. In particular, entities have now a method "xml_declaration"
2515  * which returns the name/value pairs of such a declaration. The "encoding"
2516  * setting is interpreted by the entity itself; "version", and "standalone"
2517  * are interpreted by Markup_yacc.parse_document_entity. Other settings
2518  * are ignored (this does not conform to the standard; the standard prescribes
2519  * that "version" MUST be given in the declaration of document; "standalone"
2520  * and "encoding" CAN be declared; no other settings are allowed).
2521  *      TODO: The user should be warned if the standard is not exactly
2522  * fulfilled. -- The "standalone" property is not checked yet.
2523  *
2524  * Revision 1.1  1999/08/10 00:35:52  gerd
2525  *      Initial revision.
2526  *
2527  *
2528  *)