helm/DEVEL/pxp/pxp/pxp_entity.ml

   1 (* $Id$
   2  * ----------------------------------------------------------------------
   3  * PXP: The polymorphic XML parser for Objective Caml.
   4  * Copyright by Gerd Stolpmann. See LICENSE for details.
   5  *)
   6
   7
   8 (* TODO:
   9  * - Wie verhindert man, dass ein internal entity eine XML-Dekl. im
  10  *   replacement text akzeptiert?
  11  *)
  12
  13
  14 open Pxp_types
  15 open Pxp_lexer_types
  16 open Pxp_aux
  17 open Pxp_reader
  18
  19 (* Hierarchy of parsing layers:
  20  *
  21  * - Parser: Pxp_yacc
  22  *   + gets input stream from the main entity object
  23  *   + checks most of the grammar
  24  *   + creates the DTD object as side-effect
  25  *   + creates the element tree as side-effect
  26  *   + creates further entity objects that are entered into the DTD
  27  * - Entity layer: Pxp_entity
  28  *   + gets input stream from the lexers, or another entity object
  29  *   + handles entity references: if a reference is encountered the
  30  *     input stream is redirected such that the tokens come from the
  31  *     referenced entity object
  32  *   + handles conditional sections
  33  * - Lexer layer: Pxp_lexers
  34  *   + gets input from lexbuffers created by resolvers
  35  *   + different lexers for different lexical contexts
  36  *   + a lexer returns pairs (token,lexid), where token is the scanned
  37  *     token, and lexid is the name of the lexer that must be used for
  38  *     the next token
  39  * - Resolver layer: Pxp_entity
  40  *   + a resolver creates the lexbuf from some character source
  41  *   + a resolver recodes the input and handles the encoding scheme
  42  *)
  43
  44 (**********************************************************************)
  45
  46 (* Variables of type 'state' are used to insert Begin_entity and End_entity
  47  * tokens into the stream.
  48  * - At_beginning: Nothing has been read so far
  49  * - First_token tok: A Begin_entity has been inserted; and the next token
  50  *   is 'tok' which is not Eof. (Begin_entity/End_entity must not be inserted
  51  *   if the entity is empty.)
  52  * - In_stream: After the first token has been read, but befor Eof.
  53  * - At_end: Eof has been read, and End_entity has been returned.
  54  *)
  55
  56 type state =
  57     At_beginning
  58   | Inserted_begin_entity
  59   | At_end
  60 ;;
  61
  62
  63 (**********************************************************************)
  64
  65 class virtual entity the_dtd the_name the_warner
  66               init_errors_with_line_numbers init_encoding =
  67   object (self)
  68     (* This class prescribes the type of all entity objects. Furthermore,
  69      * the default 'next_token' mechanism is implemented.
  70      *)
  71
  72     (* 'init_errors_with_line_numbers': whether error messages contain line
  73      * numbers or not.
  74      * Calculating line numbers is expensive.
  75      *)
  76
  77     val mutable dtd = the_dtd
  78     val mutable name = the_name
  79     val mutable warner = the_warner
  80
  81     val encoding = (init_encoding : rep_encoding)
  82     val lexerset = Pxp_lexers.get_lexer_set init_encoding
  83
  84     method encoding = encoding
  85     (* method lexerset = lexerset *)
  86
  87     val mutable manager = None
  88       (* The current entity_manager, see below *)
  89
  90     method private manager =
  91       ( match manager with
  92             None -> assert false
  93           | Some m -> m
  94       : < current_entity : entity;
  95           pop_entity : unit;
  96           push_entity : entity -> unit >
  97       )
  98
  99     method set_manager m = manager <- Some m
 100
 101
 102     val mutable lexbuf = Lexing.from_string ""
 103       (* The lexical buffer currently used as character source. *)
 104
 105     val mutable prolog = None
 106       (* Stores the initial <?xml ...?> token as PI_xml *)
 107
 108     val mutable prolog_pairs = []
 109       (* If prolog <> None, these are the (name,value) pairs of the
 110        * processing instruction.
 111        *)
 112
 113
 114     val mutable lex_id = Document
 115       (* The name of the lexer that should be used for the next token *)
 116
 117     method set_lex_id id = lex_id <- lex_id
 118
 119
 120
 121     val mutable force_parameter_entity_parsing = false
 122       (* 'true' forces that inner entities will always be embraced by
 123        *        Begin_entity and End_entity.
 124        * 'false': the inner entity itself decides this
 125        *)
 126
 127     val mutable check_text_declaration = true
 128       (* 'true': It is checked that the <?xml..?> declaration matches the
 129        *         production TextDecl.
 130        *)
 131
 132     val mutable normalize_newline = true
 133       (* Whether this entity converts CRLF or CR to LF, or not *)
 134
 135
 136     val mutable line = 1     (* current line *)
 137     val mutable column = 0   (* current column *)
 138     val mutable pos = 0      (* current absolute character position *)
 139     val errors_with_line_numbers = init_errors_with_line_numbers
 140
 141     val mutable p_line = 1
 142     val mutable p_column = 1
 143
 144     method line = p_line
 145     method column = p_column
 146
 147
 148     val mutable counts_as_external = false
 149
 150     method counts_as_external = counts_as_external
 151         (* Whether the entity counts as external (for the standalone check). *)
 152
 153     method set_counts_as_external =
 154       counts_as_external <- true
 155
 156
 157     val mutable last_token = Bof
 158       (* XXX
 159        * These two variables are used to check that between certain pairs of
 160        * tokens whitespaces exist. 'last_token' is simply the last token,
 161        * but not Ignore, and not PERef (which both represent whitespace).
 162        * 'space_seen' records whether Ignore or PERef was seen between this
 163        * token and 'last_token'.
 164        *)
 165
 166     val mutable deferred_token = None
 167       (* If you set this to Some tl, the next invocations of
 168        * next_token_from_entity will return the tokens in tl.
 169        * This makes it possible to insert tokens into the stream.
 170        *)
 171
 172     val mutable debug = false
 173
 174     method is_ndata = false
 175       (* Returns if this entity is an NDATA (unparsed) entity *)
 176
 177     method name = name
 178
 179     method virtual open_entity : bool -> lexers -> unit
 180         (* open_entity force_parsing lexid:
 181          * opens the entity, and the first token is scanned by the lexer
 182          * 'lexid'. 'force_parsing' forces that Begin_entity and End_entity
 183          * tokens embrace the inner tokens of the entity; otherwise this
 184          * depends on the entity.
 185          * By opening an entity, reading tokens from it, and finally closing
 186          * the entity, the inclusion methods "Included",
 187          * "Included if validating", and "Included as PE" can be carried out.
 188          * Which method is chosen depends on the 'lexid', i.e. the lexical
 189          * context: 'lexid = Content' performs "Included (if validating)" (we
 190          * are always validating); 'lexid = Declaration' performs
 191          * "Included as PE". The difference is which tokens are recognized,
 192          * and how spaces are handled.
 193          * 'force_parsing' causes that a Begin_entity token is inserted before
 194          * and an End_entity token is inserted after the entity. The yacc
 195          * rules allow the Begin_entity ... End_entity brace only at certain
 196          * positions; this is used to restrict the possible positions where
 197          * entities may be included, and to guarantee that the entity matches
 198          * a certain production of the grammar ("parsed entities").
 199          * 'open_entity' is currently invoked with 'force_parsing = true'
 200          * for toplevel nodes, for inclusion of internal general entities,
 201          * and for inclusion of parameter entities into document entities.
 202          * 'force_parsing = false' is used for all other cases: External
 203          * entities add the Begin_entity/End_entity tokens anyway; internal
 204          * entities do not. Especially internal parameter entities referenced
 205          * from non-document entities do not add these tokens.
 206          *)
 207
 208     method virtual close_entity : lexers
 209         (* close_entity:
 210          * closes the entity and returns the name of the lexer that must
 211          * be used to scan the next token.
 212          *)
 213
 214     method virtual replacement_text : (string * bool)
 215         (* replacement_text:
 216          * returns the replacement text of the entity, and as second value,
 217          * whether the replacement text was constructed by referencing
 218          * external entities (directly or indirectly).
 219          * This method implements the inclusion method "Included in Literal".
 220          *)
 221
 222
 223     method lexbuf = lexbuf
 224
 225
 226     method xml_declaration =
 227       (* return the (name,value) pairs of the initial <?xml name=value ...?>
 228        * processing instruction.
 229        *)
 230       match prolog with
 231           None ->
 232             None
 233         | Some p ->
 234             Some prolog_pairs
 235
 236
 237     method set_debugging_mode m =
 238       debug <- m
 239
 240     method private virtual set_encoding : string -> unit
 241
 242
 243     method full_name =
 244       name
 245
 246
 247     method next_token =
 248       (* read next token from this entity *)
 249
 250       match deferred_token with
 251           Some toklist ->
 252             ( match toklist with
 253                   [] ->
 254                     deferred_token <- None;
 255                     self # next_token
 256                 | tok :: toklist' ->
 257                     deferred_token <- Some toklist';
 258                     if debug then
 259                       prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok ^ " (deferred)");
 260                     tok
 261             )
 262         | None -> begin
 263             let this_line = line
 264             and this_column = column in
 265             let this_pos = pos in
 266             p_line <- this_line;
 267             p_column <- this_column;
 268             (* Read the next token from the appropriate lexer lex_id, and get the
 269              * name lex_id' of the next lexer to be used.
 270              *)
 271             let tok, lex_id' =
 272               match lex_id with
 273                   Document         -> lexerset.scan_document lexbuf
 274                 | Document_type    -> lexerset.scan_document_type lexbuf
 275                 | Content          -> lexerset.scan_content lexbuf
 276                 | Within_tag       -> lexerset.scan_within_tag lexbuf
 277                 | Declaration      -> lexerset.scan_declaration lexbuf
 278                 | Content_comment  -> lexerset.scan_content_comment lexbuf
 279                 | Decl_comment     -> lexerset.scan_decl_comment lexbuf
 280                 | Document_comment -> lexerset.scan_document_comment lexbuf
 281                 | Ignored_section  -> assert false
 282                       (* Ignored_section: only used by method next_ignored_token *)
 283             in
 284             if debug then
 285               prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok);
 286             (* Find out the number of lines and characters of the last line: *)
 287             let n_lines, n_columns =
 288               if errors_with_line_numbers then
 289                 count_lines (Lexing.lexeme lexbuf)
 290               else
 291                 0, (Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf)
 292             in
 293             line <- this_line + n_lines;
 294             column <- if n_lines = 0 then this_column + n_columns else n_columns;
 295             pos <- Lexing.lexeme_end lexbuf;
 296             lex_id <- lex_id';
 297             (* Throw Ignore and Comment away; Interpret entity references: *)
 298             (* NOTE: Of course, references to general entities are not allowed
 299              * everywhere; parameter references, too. This is already done by the
 300              * lexers, i.e. &name; and %name; are recognized only where they
 301              * are allowed.
 302              *)
 303
 304             (* TODO: last_token is only used to detect Bof. Can be simplified *)
 305
 306             let at_bof = (last_token = Bof) in
 307             last_token <- tok;
 308
 309             let tok' =
 310               match tok with
 311
 312           (* Entity references: *)
 313
 314                 | ERef n    ->
 315                     let en, extdecl = dtd # gen_entity n in
 316                     if dtd # standalone_declaration && extdecl then
 317                       raise
 318                         (Validation_error
 319                            ("Reference to entity `" ^ n ^
 320                             "' violates standalone declaration"));
 321                     en # set_debugging_mode debug;
 322                     en # open_entity true lex_id;
 323                     self # manager # push_entity en;
 324                     en # next_token;
 325                 | PERef n   ->
 326                     let en = dtd # par_entity n in
 327                     en # set_debugging_mode debug;
 328                     en # open_entity force_parameter_entity_parsing lex_id;
 329                     self # manager # push_entity en;
 330                     en # next_token;
 331
 332           (* Convert LineEnd to CharData *)
 333                 | LineEnd s ->
 334                     if normalize_newline then
 335                       CharData "\n"
 336                     else
 337                       CharData s
 338
 339           (* Also normalize CDATA sections *)
 340                 | Cdata value as cd ->
 341                     if normalize_newline then
 342                       Cdata(normalize_line_separators lexerset value)
 343                     else
 344                       cd
 345
 346           (* If there are CRLF sequences in a PI value, normalize them, too *)
 347                 | PI(name,value) as pi ->
 348                     if normalize_newline then
 349                       PI(name, normalize_line_separators lexerset value)
 350                     else
 351                       pi
 352
 353           (* Attribute values: If they are already normalized, they are turned
 354            * into Attval_nl_normalized. This is detected by other code.
 355            *)
 356                 | Attval value as av ->
 357                     if normalize_newline then
 358                       av
 359                     else
 360                       Attval_nl_normalized value
 361
 362           (* Another CRLF normalization case: Unparsed_string *)
 363                 | Unparsed_string value as ustr ->
 364                     if normalize_newline then
 365                       Unparsed_string(normalize_line_separators lexerset value)
 366                     else
 367                       ustr
 368
 369           (* These tokens require that the entity_id parameter is set: *)
 370                 | Doctype _      -> Doctype       (self :> entity_id)
 371                 | Doctype_rangle _ ->Doctype_rangle(self :> entity_id)
 372                 | Dtd_begin _    -> Dtd_begin     (self :> entity_id)
 373                 | Dtd_end _      -> Dtd_end       (self :> entity_id)
 374                 | Decl_element _ -> Decl_element  (self :> entity_id)
 375                 | Decl_attlist _ -> Decl_attlist  (self :> entity_id)
 376                 | Decl_entity _  -> Decl_entity   (self :> entity_id)
 377                 | Decl_notation _ ->Decl_notation (self :> entity_id)
 378                 | Decl_rangle _  -> Decl_rangle   (self :> entity_id)
 379                 | Lparen _       -> Lparen        (self :> entity_id)
 380                 | Rparen _       -> Rparen        (self :> entity_id)
 381                 | RparenPlus _   -> RparenPlus    (self :> entity_id)
 382                 | RparenStar _   -> RparenStar    (self :> entity_id)
 383                 | RparenQmark _  -> RparenQmark   (self :> entity_id)
 384                 | Conditional_begin _ -> Conditional_begin (self :> entity_id)
 385                 | Conditional_body _  -> Conditional_body  (self :> entity_id)
 386                 | Conditional_end _   -> Conditional_end   (self :> entity_id)
 387                 | Tag_beg (n,_)  -> Tag_beg (n, (self :> entity_id))
 388                 | Tag_end (n,_)  -> Tag_end (n, (self :> entity_id))
 389
 390           (* End of file: *)
 391
 392                 | Eof       ->
 393                     if debug then begin
 394                       prerr_endline ("- Entity " ^ name ^ " # handle_eof");
 395                       let tok = self # handle_eof in
 396                       prerr_endline ("- Entity " ^ name ^ " # handle_eof: returns " ^ string_of_tok tok);
 397                       tok
 398                     end
 399                     else
 400                       self # handle_eof;
 401
 402           (* The default case. *)
 403
 404                 | _         ->
 405                     tok
 406
 407             in
 408             if at_bof & tok <> Eof
 409             then begin
 410               if debug then
 411                 prerr_endline ("- Entity " ^ name ^ " # handle_bof");
 412               self # handle_bof tok'
 413             end
 414             else
 415               tok'
 416           end
 417
 418
 419     (* 'handle_bof' and 'handle_eof' can be used as hooks. Behaviour:
 420      *
 421      * - Normally, the first token t is read in, and 'handle_bof t' is
 422      *   called. The return value of this method is what is returned to
 423      *   the user.
 424      * - If the EOF has been reached, 'handle_eof' is called.
 425      * - BUT: If the first token is already EOF, 'handle_eof' is called
 426      *   ONLY, and 'handle_bof' is NOT called.
 427      *
 428      * The default implementations:
 429      * - handle_bof: does nothing
 430      * - handle_eof: Pops the previous entity from the stack, switches back
 431      *   to this entity, and returns the next token of this entity.
 432      *)
 433
 434
 435     method private handle_bof tok =
 436       tok
 437
 438
 439     method private handle_eof =
 440       let mng = self # manager in
 441       begin try
 442         mng # pop_entity;
 443         let next_lex_id = self # close_entity in
 444         let en = mng # current_entity in
 445         en # set_lex_id next_lex_id;
 446         en # next_token
 447       with
 448           Stack.Empty ->
 449             (* The outermost entity is at EOF *)
 450             Eof
 451       end
 452
 453
 454     method next_ignored_token =
 455         (* used after <![ IGNORE *)
 456
 457       (* TODO: Do we need a test on deferred tokens here? *)
 458
 459         let this_line = line
 460         and this_column = column in
 461         let this_pos = pos in
 462         let tok, lex_id' = lexerset.scan_ignored_section lexbuf in
 463         if debug then
 464           prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok ^ " (Ignored)");
 465         let n_lines, n_columns = count_lines (Lexing.lexeme lexbuf) in
 466         line <- this_line + n_lines;
 467         column <- if n_lines = 0 then this_column + n_columns else n_columns;
 468         pos <- Lexing.lexeme_end lexbuf;
 469         match tok with
 470           | Conditional_begin _ -> Conditional_begin (self :> entity_id)
 471           | Conditional_end _   -> Conditional_end   (self :> entity_id)
 472           | _                   -> tok
 473
 474
 475     method process_xmldecl pl =
 476       (* The parser calls this method just after the XML declaration
 477        * <?xml ...?> has been detected.
 478        * 'pl': This is the argument of the PI_xml token.
 479        *)
 480       if debug then
 481         prerr_endline ("- Entity " ^ name ^ " # process_xmldecl");
 482       prolog <- Some pl;
 483       prolog_pairs <- decode_xml_pi pl;
 484       if check_text_declaration then
 485         check_text_xml_pi prolog_pairs;
 486       begin
 487         try
 488           let e = List.assoc "encoding" prolog_pairs in
 489           self # set_encoding e
 490         with
 491             Not_found ->
 492               self # set_encoding ""
 493       end;
 494
 495
 496     method process_missing_xmldecl =
 497       (* The parser calls this method if the XML declaration is missing *)
 498       if debug then
 499         prerr_endline ("- Entity " ^ name ^ " # process_missing_xmldecl");
 500       self # set_encoding ""
 501
 502
 503     (* Methods for NDATA entities only: *)
 504     method ext_id = (assert false : ext_id)
 505     method notation = (assert false : string)
 506
 507   end
 508 ;;
 509
 510
 511 class ndata_entity the_name the_ext_id the_notation init_encoding =
 512   object (self)
 513     (* An NDATA entity is very restricted; more or less you can only find out
 514      * its external ID and its notation.
 515      *)
 516
 517     val mutable name = the_name
 518     val mutable ext_id = the_ext_id
 519     val mutable notation = the_notation
 520     val encoding = (init_encoding : rep_encoding)
 521
 522     method name = (name : string)
 523     method ext_id = (ext_id : ext_id)
 524     method notation = (notation : string)
 525
 526     method is_ndata = true
 527
 528     method encoding = encoding
 529
 530
 531     val mutable counts_as_external = false
 532
 533     method counts_as_external = counts_as_external
 534         (* Whether the entity counts as external (for the standalone check). *)
 535
 536     method set_counts_as_external =
 537       counts_as_external <- true
 538
 539
 540     method set_manager (m : < current_entity : entity;
 541                               pop_entity : unit;
 542                               push_entity : entity -> unit >) =
 543       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 544           : unit )
 545
 546     method set_lex_id (id : lexers) =
 547       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 548           : unit )
 549
 550     method line =
 551       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 552           : int )
 553
 554     method column =
 555       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 556           : int )
 557
 558     method full_name =
 559       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 560           : string )
 561
 562     method private set_encoding (_:string) =
 563       assert false
 564
 565     method xml_declaration = (None : (string*string) list option)
 566
 567     method set_debugging_mode (_:bool) = ()
 568
 569     method open_entity (_:bool) (_:lexers) =
 570       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 571           : unit )
 572
 573     method close_entity =
 574       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 575           : lexers )
 576
 577     method replacement_text =
 578       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 579           : (string * bool) )
 580
 581     method lexbuf =
 582       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 583           : Lexing.lexbuf )
 584
 585     method next_token =
 586       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 587           : token )
 588
 589     method next_ignored_token =
 590       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 591           : token )
 592
 593     method process_xmldecl (pl:prolog_token list) =
 594       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 595           : unit )
 596
 597     method process_missing_xmldecl =
 598       ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
 599           : unit )
 600
 601   end
 602 ;;
 603
 604
 605 class external_entity the_resolver the_dtd the_name the_warner the_ext_id
 606                       the_p_special_empty_entities
 607                       init_errors_with_line_numbers
 608                       init_encoding
 609   =
 610   object (self)
 611     inherit entity
 612               the_dtd the_name the_warner init_errors_with_line_numbers
 613               init_encoding
 614             as super
 615
 616     (* An external entity gets the lexbuf that is used as character source
 617      * from a resolver.
 618      * Furthermore, before the first token an Begin_entity is inserted, and
 619      * before Eof an End_entity token is inserted into the stream. This done
 620      * always regardless of the argument 'force_parsing' of the method
 621      * 'open_entity'.
 622      *
 623      * 'the_p_internal_subset': see class internal_entity
 624      * 'the_p_special_empty_entities': if true, a Begin_entity/End_entity
 625      * brace is left out if the entity is otherwise empty.
 626      *)
 627
 628     val resolver = (the_resolver : resolver)
 629     val ext_id = (the_ext_id : ext_id)
 630
 631     val p_special_empty_entities = (the_p_special_empty_entities : bool)
 632
 633     val mutable resolver_is_open = false
 634       (* Track if the resolver is open. This is also used to find recursive
 635        * references of entities.
 636        *)
 637
 638     val mutable state = At_beginning
 639
 640     initializer
 641       counts_as_external <- true;
 642
 643
 644     method private set_encoding e =
 645       assert resolver_is_open;
 646       resolver # change_encoding e
 647
 648
 649     method full_name =
 650       name ^
 651       match ext_id with
 652           System s    -> " = SYSTEM \"" ^ s ^ "\""
 653         | Public(p,s) -> " = PUBLIC \"" ^ p ^ "\" \"" ^ s ^ "\""
 654         | Anonymous   -> " = ANONYMOUS"
 655
 656
 657     method open_entity force_parsing init_lex_id =
 658       (* Note that external entities are always parsed, i.e. Begin_entity
 659        * and End_entity tokens embrace the inner tokens to force that
 660        * the entity is only called where the syntax allows it.
 661        *)
 662       if resolver_is_open then
 663         raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
 664       let lex =
 665         try
 666           resolver # open_in ext_id
 667         with
 668             Pxp_reader.Not_competent ->
 669               raise(Error ("No input method available for this external entity: " ^
 670                         self # full_name))
 671           | Pxp_reader.Not_resolvable Not_found ->
 672               raise(Error ("Unable to open the external entity: " ^
 673                            self # full_name))
 674           | Pxp_reader.Not_resolvable e ->
 675               raise(Error ("Unable to open the external entity: " ^
 676                            self # full_name ^ "; reason: " ^
 677                            string_of_exn e))
 678       in
 679       resolver_is_open <- true;
 680       lexbuf  <- lex;
 681       prolog  <- None;
 682       lex_id  <- init_lex_id;
 683       state <- At_beginning;
 684       line <- 1;
 685       column <- 0;
 686       pos <- 0;
 687       last_token <- Bof;
 688       normalize_newline <- true;
 689
 690
 691     method private handle_bof tok =
 692       (* This hook is only called if the stream is not empty. *)
 693       deferred_token <- Some [ tok ];
 694       state <- Inserted_begin_entity;
 695       Begin_entity
 696
 697
 698     method private handle_eof =
 699       (* This hook is called if the end of  the stream is reached *)
 700       match state with
 701           At_beginning ->
 702             (* This is only possible if the stream is empty. *)
 703             if p_special_empty_entities then begin
 704               (* Continue immediately with the next token *)
 705               state <- At_end;
 706               super # handle_eof
 707             end
 708             else begin
 709               (* Insert Begin_entity / End_entity *)
 710               deferred_token <- Some [ End_entity ];
 711               state <- At_end;
 712               Begin_entity;
 713               (* After these two token have been processed, the lexer
 714                * is called again, and it will return another Eof.
 715                *)
 716             end
 717         | Inserted_begin_entity ->
 718             (* Insert End_entity, too. *)
 719             state <- At_end;
 720             End_entity;
 721         | At_end ->
 722             (* Continue with the next token: *)
 723             super # handle_eof
 724
 725
 726     method close_entity =
 727       if not resolver_is_open then
 728         failwith ("External entity " ^ name ^ " not open");
 729       resolver # close_in;
 730       resolver_is_open <- false;
 731       lex_id
 732
 733
 734     method replacement_text =
 735       (* Return the replacement text of the entity. The method used for this
 736        * is more or less the same as for internal entities; i.e. character
 737        * and parameter entities are resolved immediately. In addition to that,
 738        * external entities may begin with an "xml" processing instruction
 739        * which is considered not to be part of the replacement text.
 740        *)
 741       if resolver_is_open then
 742         raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
 743       let lex = resolver # open_in ext_id in
 744       resolver_is_open <- true;
 745       lexbuf  <- lex;
 746       prolog  <- None;
 747       (* arbitrary:    lex_id  <- init_lex_id; *)
 748       state <- At_beginning;
 749       line <- 1;
 750       column <- 0;
 751       pos <- 0;
 752       last_token <- Bof;
 753       (* First check if the first token of 'lex' is <?xml...?> *)
 754       begin match lexerset.scan_only_xml_decl lex with
 755           PI_xml pl ->
 756             self # process_xmldecl pl
 757         | Eof ->
 758             (* This only means that the first token was not <?xml...?>;
 759              * the "Eof" token represents the empty string.
 760              *)
 761             self # process_missing_xmldecl
 762         | _ ->
 763             (* Must not happen. *)
 764             assert false
 765       end;
 766       (* Then create the replacement text. *)
 767       let rec scan_and_expand () =
 768         match lexerset.scan_dtd_string lexbuf with
 769             ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
 770           | CRef(-1) -> "\n" ^ scan_and_expand()
 771           | CRef(-2) -> "\n" ^ scan_and_expand()
 772           | CRef(-3) -> "\n" ^ scan_and_expand()
 773           | CRef k -> character encoding warner k ^ scan_and_expand()
 774           | CharData x -> x ^ scan_and_expand()
 775           | PERef n ->
 776               let en = dtd # par_entity n in
 777               let (x,_) = en # replacement_text in
 778               x ^ scan_and_expand()
 779           | Eof ->
 780               ""
 781           | _ ->
 782               assert false
 783       in
 784       let rtext = scan_and_expand() in
 785       resolver # close_in;
 786       resolver_is_open <- false;
 787       rtext, true
 788         (* TODO:
 789          * - The replaced text is not parsed [VALIDATION WEAKNESS]
 790          *)
 791   end
 792 ;;
 793
 794
 795 class document_entity  the_resolver the_dtd the_name the_warner the_ext_id
 796                        init_errors_with_line_numbers
 797                        init_encoding
 798   =
 799   object (self)
 800     inherit external_entity  the_resolver the_dtd the_name the_warner
 801                              the_ext_id false init_errors_with_line_numbers
 802                              init_encoding
 803
 804     (* A document entity is an external entity that does not allow
 805      * conditional sections, and that forces that internal parameter entities
 806      * are properly nested.
 807      *)
 808
 809     initializer
 810     force_parameter_entity_parsing <- true;
 811     check_text_declaration <- false;
 812
 813     method counts_as_external = false
 814       (* Document entities count never as external! *)
 815   end
 816 ;;
 817
 818
 819 class internal_entity the_dtd the_name the_warner the_literal_value
 820                       the_p_internal_subset init_errors_with_line_numbers
 821                       init_is_parameter_entity
 822                       init_encoding
 823   =
 824   (* An internal entity uses a "literal entity value" as character source.
 825    * This value is first expanded and preprocessed, i.e. character and
 826    * parameter references are expanded.
 827    *
 828    * 'the_p_internal_subset': indicates that the entity is declared in the
 829    * internal subset. Such entity declarations are not allowed to contain
 830    * references to parameter entities.
 831    * 'init_is_parameter_entity': whether this is a parameter entity or not
 832    *)
 833
 834   object (self)
 835     inherit entity
 836               the_dtd the_name the_warner init_errors_with_line_numbers
 837               init_encoding
 838             as super
 839
 840     val p_internal_subset = the_p_internal_subset
 841
 842     val mutable replacement_text = ""
 843     val mutable contains_external_references = false
 844     val mutable p_parsed_actually = false
 845     val mutable is_open = false
 846     val mutable state = At_beginning
 847     val mutable is_parameter_entity = init_is_parameter_entity
 848
 849
 850     initializer
 851     let lexbuf = Lexing.from_string the_literal_value in
 852     let rec scan_and_expand () =
 853       match lexerset.scan_dtd_string lexbuf with
 854           ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
 855         | CRef(-1) -> "\r\n" ^ scan_and_expand()
 856         | CRef(-2) -> "\r" ^ scan_and_expand()
 857         | CRef(-3) -> "\n" ^ scan_and_expand()
 858         | CRef k -> character encoding warner k ^ scan_and_expand()
 859         | CharData x -> x ^ scan_and_expand()
 860         | PERef n ->
 861             if p_internal_subset then
 862               raise(WF_error("Restriction of the internal subset: parameter entity not allowed here"));
 863             let en = dtd # par_entity n in
 864             let (x, extref) = en # replacement_text in
 865             contains_external_references <-
 866               contains_external_references or extref;
 867             x ^ scan_and_expand()
 868         | Eof ->
 869             ""
 870         | _ ->
 871             assert false
 872     in
 873     is_open <- true;
 874     replacement_text <- scan_and_expand();
 875     is_open <- false;
 876     normalize_newline <- false;
 877     counts_as_external <- false;
 878
 879
 880     method process_xmldecl (pl:prolog_token list) =
 881       raise(Validation_error("The encoding cannot be changed in internal entities"))
 882
 883
 884     method process_missing_xmldecl =
 885       ()
 886
 887
 888     method private set_encoding e =
 889       (* Ignored if e = "" *)
 890       assert(e = "");
 891
 892
 893     method open_entity force_parsing init_lex_id =
 894       if is_open then
 895         raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
 896
 897       p_parsed_actually <- force_parsing;
 898       lexbuf  <- Lexing.from_string
 899                    (if is_parameter_entity then
 900                       (" " ^ replacement_text ^ " ")
 901                     else
 902                       replacement_text);
 903       prolog  <- None;
 904       lex_id  <- init_lex_id;
 905       state <- At_beginning;
 906       is_open <- true;
 907       line <- 1;
 908       column <- 0;
 909       pos <- 0;
 910       last_token <- Eof;
 911
 912
 913     method private handle_bof tok =
 914       (* This hook is only called if the stream is not empty. *)
 915       if p_parsed_actually then begin
 916         deferred_token <- Some [ tok ];
 917         state <- Inserted_begin_entity;
 918         Begin_entity
 919       end
 920       else begin
 921         state <- At_end;
 922         tok
 923       end
 924
 925
 926     method private handle_eof =
 927       (* This hook is called if the end of  the stream is reached *)
 928       match state with
 929           At_beginning ->
 930             (* This is only possible if the stream is empty. *)
 931             if p_parsed_actually then begin
 932               (* Insert Begin_entity / End_entity *)
 933               deferred_token <- Some [ End_entity ];
 934               state <- At_end;
 935               Begin_entity;
 936               (* After these two token have been processed, the lexer
 937                * is called again, and it will return another Eof.
 938                *)
 939             end
 940             else begin
 941               (* Continue immediately with the next token *)
 942               state <- At_end;
 943               super # handle_eof
 944             end
 945         | Inserted_begin_entity ->
 946             (* Insert End_entity, too. *)
 947             state <- At_end;
 948             End_entity;
 949         | At_end ->
 950             (* Continue with the next token: *)
 951             super # handle_eof
 952
 953
 954     method close_entity =
 955       if not is_open then
 956         failwith ("Internal entity " ^ name ^ " not open");
 957       is_open <- false;
 958       lex_id
 959
 960
 961     method replacement_text =
 962       if is_open then
 963         raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
 964       replacement_text, contains_external_references
 965   end
 966 ;;
 967
 968 (**********************************************************************)
 969
 970 (* An 'entity_manager' is a stack of entities, where the topmost entity
 971  * is the currently active entity, the second entity is the entity that
 972  * referred to the active entity, and so on.
 973  *
 974  * The entity_manager can communicate with the currently active entity.
 975  *
 976  * The entity_manager provides an interface for the parser; the functions
 977  * returning the current token and the next token are exported.
 978  *)
 979
 980 class entity_manager (init_entity : entity) =
 981   object (self)
 982     val mutable entity_stack = Stack.create()
 983     val mutable current_entity = init_entity
 984     val mutable current_entity's_full_name = lazy (init_entity # full_name)
 985
 986     val mutable yy_get_next_ref = ref (fun () -> assert false)
 987
 988     initializer
 989       init_entity # set_manager (self :>
 990                                  < current_entity : entity;
 991                                    pop_entity : unit;
 992                                    push_entity : entity -> unit >
 993                                 );
 994       yy_get_next_ref := (fun () -> init_entity # next_token)
 995
 996     method push_entity e =
 997       e # set_manager (self :>
 998                        < current_entity : entity;
 999                          pop_entity : unit;
1000                          push_entity : entity -> unit >
1001                       );
1002       Stack.push (current_entity, current_entity's_full_name) entity_stack;
1003       current_entity <- e;
1004       current_entity's_full_name <- lazy (e # full_name);
1005       yy_get_next_ref := (fun () -> e # next_token);
1006
1007     method pop_entity =
1008       (* May raise Stack.Empty *)
1009       let e, e_name = Stack.pop entity_stack in
1010       current_entity <- e;
1011       current_entity's_full_name <- e_name;
1012       yy_get_next_ref := (fun () -> e # next_token);
1013
1014
1015
1016     method position_string =
1017       (* Gets a string describing the position of the last token;
1018        * includes an entity backtrace
1019        *)
1020       let b = Buffer.create 200 in
1021       Buffer.add_string b
1022         ("In entity " ^ current_entity # full_name
1023          ^ ", at line " ^ string_of_int (current_entity # line)
1024          ^ ", position " ^ string_of_int (current_entity # column)
1025          ^ ":\n");
1026       Stack.iter
1027         (fun (e, e_name) ->
1028            Buffer.add_string b
1029              ("Called from entity " ^ Lazy.force e_name
1030               ^ ", line " ^ string_of_int (e # line)
1031               ^  ", position " ^ string_of_int (e # column)
1032               ^ ":\n");
1033         )
1034         entity_stack;
1035       Buffer.contents b
1036
1037
1038     method position =
1039       (* Returns the triple (full_name, line, column) of the last token *)
1040       Lazy.force current_entity's_full_name,
1041       current_entity # line,
1042       current_entity # column
1043
1044
1045     method current_entity_counts_as_external =
1046       (* Whether the current entity counts as external to the main
1047        * document for the purpose of stand-alone checks.
1048        *)
1049       (* TODO: improve performance *)
1050       let is_external = ref false in
1051       let check (e, _) =
1052         if e # counts_as_external then begin
1053           is_external := true;
1054         end;
1055       in
1056       check (current_entity,());
1057       Stack.iter check entity_stack;
1058       !is_external
1059
1060
1061     method current_entity  = current_entity
1062
1063     method yy_get_next_ref = yy_get_next_ref
1064
1065   end
1066 ;;
1067
1068
1069
1070 (* ======================================================================
1071  * History:
1072  *
1073  * $Log$
1074  * Revision 1.1  2000/11/17 09:57:29  lpadovan
1075  * Initial revision
1076  *
1077  * Revision 1.6  2000/07/14 13:55:00  gerd
1078  *      Cosmetic changes.
1079  *
1080  * Revision 1.5  2000/07/09 17:51:50  gerd
1081  *      Entities return now the beginning of a token as its
1082  * position.
1083  *      New method 'position' for entity_manager.
1084  *
1085  * Revision 1.4  2000/07/09 01:05:04  gerd
1086  *      Exported methods 'ext_id' and 'notation' anyway.
1087  *
1088  * Revision 1.3  2000/07/08 16:28:05  gerd
1089  *      Updated: Exception 'Not_resolvable' is taken into account.
1090  *
1091  * Revision 1.2  2000/07/04 22:12:47  gerd
1092  *      Update: Case ext_id = Anonymous.
1093  *      Update: Handling of the exception Not_competent when reading
1094  * from a resolver.
1095  *
1096  * Revision 1.1  2000/05/29 23:48:38  gerd
1097  *      Changed module names:
1098  *              Markup_aux          into Pxp_aux
1099  *              Markup_codewriter   into Pxp_codewriter
1100  *              Markup_document     into Pxp_document
1101  *              Markup_dtd          into Pxp_dtd
1102  *              Markup_entity       into Pxp_entity
1103  *              Markup_lexer_types  into Pxp_lexer_types
1104  *              Markup_reader       into Pxp_reader
1105  *              Markup_types        into Pxp_types
1106  *              Markup_yacc         into Pxp_yacc
1107  * See directory "compatibility" for (almost) compatible wrappers emulating
1108  * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
1109  *
1110  * ======================================================================
1111  * Old logs from markup_entity.ml:
1112  *
1113  * Revision 1.27  2000/05/29 21:14:57  gerd
1114  *      Changed the type 'encoding' into a polymorphic variant.
1115  *
1116  * Revision 1.26  2000/05/28 17:24:55  gerd
1117  *      Bugfixes.
1118  *
1119  * Revision 1.25  2000/05/27 19:23:32  gerd
1120  *      The entities store whether they count as external with
1121  * respect to the standalone check: New methods counts_as_external
1122  * and set_counts_as_external.
1123  *      The entity manager can find out whether the current
1124  * entity counts as external: method current_entity_counts_as_external.
1125  *
1126  * Revision 1.24  2000/05/20 20:31:40  gerd
1127  *      Big change: Added support for various encodings of the
1128  * internal representation.
1129  *
1130  * Revision 1.23  2000/05/14 21:51:24  gerd
1131  *      Change: Whitespace is handled by the grammar, and no longer
1132  * by the entity.
1133  *
1134  * Revision 1.22  2000/05/14 17:50:54  gerd
1135  *      Updates because of changes in the token type.
1136  *
1137  * Revision 1.21  2000/05/09 00:02:44  gerd
1138  *      Conditional sections are now recognized by the parser.
1139  * There seem some open questions; see the TODO comments!
1140  *
1141  * Revision 1.20  2000/05/08 21:58:22  gerd
1142  *      Introduced entity_manager as communication object between
1143  * the parser and the currently active entity.
1144  *      New hooks handle_bof and handle_eof.
1145  *      Removed "delegated entities". The entity manager contains
1146  * the stack of open entities.
1147  *      Changed the way Begin_entity and End_entity are inserted.
1148  * This is now done by handle_bof and handle_eof.
1149  *      The XML declaration is no longer detected by the entity.
1150  * This is now done by the parser.
1151  *
1152  * Revision 1.19  2000/05/01 15:18:44  gerd
1153  *      Improved CRLF handling in the replacement text of entities.
1154  *      Changed one error message.
1155  *
1156  * Revision 1.18  2000/04/30 18:18:39  gerd
1157  *      Bugfixes: The conversion of CR and CRLF to LF is now hopefully
1158  * done right. The new variable "normalize_newline" indicates whether
1159  * normalization must happen for that type of entity. The normalization
1160  * if actually carried out separately for every token that needs it.
1161  *
1162  * Revision 1.17  2000/03/13 23:42:38  gerd
1163  *      Removed the resolver classes, and put them into their
1164  * own module (Markup_reader).
1165  *
1166  * Revision 1.16  2000/02/22 01:06:58  gerd
1167  *      Bugfix: Resolvers are properly re-initialized. This bug caused
1168  * that entities could not be referenced twice in the same document.
1169  *
1170  * Revision 1.15  2000/01/20 20:54:11  gerd
1171  *      New config.errors_with_line_numbers.
1172  *
1173  * Revision 1.14  2000/01/08 18:59:03  gerd
1174  *      Corrected the string resolver.
1175  *
1176  * Revision 1.13  1999/09/01 22:58:23  gerd
1177  *      Method warn_not_latin1 raises Illegal_character if the character
1178  * does not match the Char production.
1179  *      External entities that are not document entities check if the
1180  * <?xml...?> declaration at the beginning matches the TextDecl production.
1181  *      Method xml_declaration has type ... list option, not ... list.
1182  *      Tag_beg and Tag_end now carry an entity_id with them.
1183  *      The code to check empty entities has changed. That the Begin_entity/
1184  * End_entity pair is not to be added must be explicitly turned on. See the
1185  * description of empty entity handling in design.txt.
1186  *      In internal subsets entity declarations are not allowed to refer
1187  * to parameter entities. The internal_entity class can do this now.
1188  *      The p_parsed parameter of internal_entity has gone. It was simply
1189  * superflous.
1190  *
1191  * Revision 1.12  1999/09/01 16:24:13  gerd
1192  *      The method replacement_text returns the text as described for
1193  * "included in literal". The former behaviour has been dropped to include
1194  * a leading and a trailing space character for parameter entities.
1195  *      Bugfix: When general entities are included, they are always parsed.
1196  *
1197  * Revision 1.11  1999/08/31 19:13:31  gerd
1198  *      Added checks on proper PE nesting. The idea is that tokens such
1199  * as Decl_element and Decl_rangle carry an entity ID with them. This ID
1200  * is simply an object of type < >, i.e. you can only test on identity.
1201  * The lexer always produces tokens with a dummy ID because it does not
1202  * know which entity is the current one. The entity layer replaces the dummy
1203  * ID with the actual ID. The parser checks that the IDs of pairs such as
1204  * Decl_element and Decl_rangle are the same; otherwise a Validation_error
1205  * is produced.
1206  *
1207  * Revision 1.10  1999/08/19 01:06:41  gerd
1208  *      Improved error messages: external entities print their
1209  * ext id, too
1210  *
1211  * Revision 1.9  1999/08/15 20:35:48  gerd
1212  *      Improved error messages.
1213  *      Before the tokens Plus, Star, Qmark space is not allowed any longer.
1214  *      Detection of recursive entity references is a bit cleaner.
1215  *
1216  * Revision 1.8  1999/08/15 15:33:44  gerd
1217  *      Revised whitespace checking: At certain positions there must be
1218  * white space. These checks cannot be part of the lexer, as %entity; counts
1219  * as white space. They cannot be part of the yacc parser because one look-ahead
1220  * token would not suffice if we did that. So these checks must be done by the
1221  * entity layer. Luckily, the rules are simple: There are simply a number of
1222  * token pairs between which white space must occur independently of where
1223  * these token have been found. Two variables, "space_seen", and "last_token"
1224  * have been added in order to check these rules.
1225  *
1226  * Revision 1.7  1999/08/15 00:41:06  gerd
1227  *      The [ token of conditional sections is now allowed to occur
1228  * in a different entity.
1229  *
1230  * Revision 1.6  1999/08/15 00:29:02  gerd
1231  *      The method "attlist_replacement_text" has gone. There is now a
1232  * more general "replacement_text" method that computes the replacement
1233  * text for both internal and external entities. Additionally, this method
1234  * returns whether references to external entities have been resolved;
1235  * this is checked in the cases where formerly "attlist_replacement_text"
1236  * was used as it is not allowed everywhere.
1237  *      Entities have a new slot "need_spaces" that indicates that the
1238  * next token must be white space or a parameter reference. The problem
1239  * was that "<!ATTLIST%e;" is legal because when including parameter
1240  * entities white space is added implicitly. Formerly, the white space
1241  * was expected by the underlying lexer; now the lexer does not check
1242  * anymore that "<!ATTLIST" is followed by white space because the lexer
1243  * cannot handle parameter references. Because of this, the check on
1244  * white space must be done by the entity.
1245  *
1246  * Revision 1.5  1999/08/14 22:57:19  gerd
1247  *      It is allowed that external entities are empty because the
1248  * empty string is well-parsed for both declarations and contents. Empty
1249  * entities can be referenced anywhere because the references are replaced
1250  * by nothing. Because of this, the Begin_entity...End_entity brace is only
1251  * inserted if the entity is non-empty. (Otherwise references to empty
1252  * entities would not be allowed anywhere.)
1253  *      As a consequence, the grammar has been changed such that a
1254  * single Eof is equivalent to Begin_entity,End_entity without content.
1255  *
1256  * Revision 1.4  1999/08/14 22:11:19  gerd
1257  *         Several objects have now a "warner" as argument which is
1258  * an object with a "warn" method. This is used to warn about characters
1259  * that cannot be represented in the Latin 1 alphabet.
1260  *      Previously, the resolvers had features in order to warn about
1261  * such characters; this has been removed.
1262  *      UTF-8 streams can be read even if they contain characters
1263  * that cannot be represented by 16 bits.
1264  *      The buffering used in the resolvers is now solved in a
1265  * cleaner way; the number of characters that are expected to be read
1266  * from a source can be limited. This removes a bug with UTF-16 streams
1267  * that previously lead to wrong exceptions; and the buffering is more
1268  * efficient, too.
1269  *
1270  * Revision 1.3  1999/08/11 14:58:53  gerd
1271  *      Some more names for encodings are allowed, such as "utf8" instead
1272  * of the standard name "UTF-8".
1273  *      'resolve_as_file' interprets relative file names as relative to
1274  * the "parent" resolver.
1275  *
1276  * Revision 1.2  1999/08/10 21:35:07  gerd
1277  *      The XML/encoding declaration at the beginning of entities is
1278  * evaluated. In particular, entities have now a method "xml_declaration"
1279  * which returns the name/value pairs of such a declaration. The "encoding"
1280  * setting is interpreted by the entity itself; "version", and "standalone"
1281  * are interpreted by Markup_yacc.parse_document_entity. Other settings
1282  * are ignored (this does not conform to the standard; the standard prescribes
1283  * that "version" MUST be given in the declaration of document; "standalone"
1284  * and "encoding" CAN be declared; no other settings are allowed).
1285  *      TODO: The user should be warned if the standard is not exactly
1286  * fulfilled. -- The "standalone" property is not checked yet.
1287  *
1288  * Revision 1.1  1999/08/10 00:35:51  gerd
1289  *      Initial revision.
1290  *
1291  *
1292  *)