2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
9 * - Wie verhindert man, dass ein internal entity eine XML-Dekl. im
10 * replacement text akzeptiert?
19 (* Hierarchy of parsing layers:
22 * + gets input stream from the main entity object
23 * + checks most of the grammar
24 * + creates the DTD object as side-effect
25 * + creates the element tree as side-effect
26 * + creates further entity objects that are entered into the DTD
27 * - Entity layer: Pxp_entity
28 * + gets input stream from the lexers, or another entity object
29 * + handles entity references: if a reference is encountered the
30 * input stream is redirected such that the tokens come from the
31 * referenced entity object
32 * + handles conditional sections
33 * - Lexer layer: Pxp_lexers
34 * + gets input from lexbuffers created by resolvers
35 * + different lexers for different lexical contexts
36 * + a lexer returns pairs (token,lexid), where token is the scanned
37 * token, and lexid is the name of the lexer that must be used for
39 * - Resolver layer: Pxp_entity
40 * + a resolver creates the lexbuf from some character source
41 * + a resolver recodes the input and handles the encoding scheme
44 (**********************************************************************)
46 (* Variables of type 'state' are used to insert Begin_entity and End_entity
47 * tokens into the stream.
48 * - At_beginning: Nothing has been read so far
49 * - First_token tok: A Begin_entity has been inserted; and the next token
50 * is 'tok' which is not Eof. (Begin_entity/End_entity must not be inserted
51 * if the entity is empty.)
52 * - In_stream: After the first token has been read, but befor Eof.
53 * - At_end: Eof has been read, and End_entity has been returned.
58 | Inserted_begin_entity
63 (**********************************************************************)
65 class virtual entity the_dtd the_name the_warner
66 init_errors_with_line_numbers init_encoding =
68 (* This class prescribes the type of all entity objects. Furthermore,
69 * the default 'next_token' mechanism is implemented.
72 (* 'init_errors_with_line_numbers': whether error messages contain line
74 * Calculating line numbers is expensive.
77 val mutable dtd = the_dtd
78 val mutable name = the_name
79 val mutable warner = the_warner
81 val encoding = (init_encoding : rep_encoding)
82 val lexerset = Pxp_lexers.get_lexer_set init_encoding
84 method encoding = encoding
85 (* method lexerset = lexerset *)
87 val mutable manager = None
88 (* The current entity_manager, see below *)
90 method private manager =
94 : < current_entity : entity;
96 push_entity : entity -> unit >
99 method set_manager m = manager <- Some m
102 val mutable lexbuf = Lexing.from_string ""
103 (* The lexical buffer currently used as character source. *)
105 val mutable prolog = None
106 (* Stores the initial <?xml ...?> token as PI_xml *)
108 val mutable prolog_pairs = []
109 (* If prolog <> None, these are the (name,value) pairs of the
110 * processing instruction.
114 val mutable lex_id = Document
115 (* The name of the lexer that should be used for the next token *)
117 method set_lex_id id = lex_id <- lex_id
121 val mutable force_parameter_entity_parsing = false
122 (* 'true' forces that inner entities will always be embraced by
123 * Begin_entity and End_entity.
124 * 'false': the inner entity itself decides this
127 val mutable check_text_declaration = true
128 (* 'true': It is checked that the <?xml..?> declaration matches the
129 * production TextDecl.
132 val mutable normalize_newline = true
133 (* Whether this entity converts CRLF or CR to LF, or not *)
136 val mutable line = 1 (* current line *)
137 val mutable column = 0 (* current column *)
138 val mutable pos = 0 (* current absolute character position *)
139 val errors_with_line_numbers = init_errors_with_line_numbers
141 val mutable p_line = 1
142 val mutable p_column = 1
145 method column = p_column
148 val mutable counts_as_external = false
150 method counts_as_external = counts_as_external
151 (* Whether the entity counts as external (for the standalone check). *)
153 method set_counts_as_external =
154 counts_as_external <- true
157 val mutable last_token = Bof
159 * These two variables are used to check that between certain pairs of
160 * tokens whitespaces exist. 'last_token' is simply the last token,
161 * but not Ignore, and not PERef (which both represent whitespace).
162 * 'space_seen' records whether Ignore or PERef was seen between this
163 * token and 'last_token'.
166 val mutable deferred_token = None
167 (* If you set this to Some tl, the next invocations of
168 * next_token_from_entity will return the tokens in tl.
169 * This makes it possible to insert tokens into the stream.
172 val mutable debug = false
174 method is_ndata = false
175 (* Returns if this entity is an NDATA (unparsed) entity *)
179 method virtual open_entity : bool -> lexers -> unit
180 (* open_entity force_parsing lexid:
181 * opens the entity, and the first token is scanned by the lexer
182 * 'lexid'. 'force_parsing' forces that Begin_entity and End_entity
183 * tokens embrace the inner tokens of the entity; otherwise this
184 * depends on the entity.
185 * By opening an entity, reading tokens from it, and finally closing
186 * the entity, the inclusion methods "Included",
187 * "Included if validating", and "Included as PE" can be carried out.
188 * Which method is chosen depends on the 'lexid', i.e. the lexical
189 * context: 'lexid = Content' performs "Included (if validating)" (we
190 * are always validating); 'lexid = Declaration' performs
191 * "Included as PE". The difference is which tokens are recognized,
192 * and how spaces are handled.
193 * 'force_parsing' causes that a Begin_entity token is inserted before
194 * and an End_entity token is inserted after the entity. The yacc
195 * rules allow the Begin_entity ... End_entity brace only at certain
196 * positions; this is used to restrict the possible positions where
197 * entities may be included, and to guarantee that the entity matches
198 * a certain production of the grammar ("parsed entities").
199 * 'open_entity' is currently invoked with 'force_parsing = true'
200 * for toplevel nodes, for inclusion of internal general entities,
201 * and for inclusion of parameter entities into document entities.
202 * 'force_parsing = false' is used for all other cases: External
203 * entities add the Begin_entity/End_entity tokens anyway; internal
204 * entities do not. Especially internal parameter entities referenced
205 * from non-document entities do not add these tokens.
208 method virtual close_entity : lexers
210 * closes the entity and returns the name of the lexer that must
211 * be used to scan the next token.
214 method virtual replacement_text : (string * bool)
216 * returns the replacement text of the entity, and as second value,
217 * whether the replacement text was constructed by referencing
218 * external entities (directly or indirectly).
219 * This method implements the inclusion method "Included in Literal".
223 method lexbuf = lexbuf
226 method xml_declaration =
227 (* return the (name,value) pairs of the initial <?xml name=value ...?>
228 * processing instruction.
237 method set_debugging_mode m =
240 method private virtual set_encoding : string -> unit
248 (* read next token from this entity *)
250 match deferred_token with
254 deferred_token <- None;
257 deferred_token <- Some toklist';
259 prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok ^ " (deferred)");
264 and this_column = column in
265 let this_pos = pos in
267 p_column <- this_column;
268 (* Read the next token from the appropriate lexer lex_id, and get the
269 * name lex_id' of the next lexer to be used.
273 Document -> lexerset.scan_document lexbuf
274 | Document_type -> lexerset.scan_document_type lexbuf
275 | Content -> lexerset.scan_content lexbuf
276 | Within_tag -> lexerset.scan_within_tag lexbuf
277 | Declaration -> lexerset.scan_declaration lexbuf
278 | Content_comment -> lexerset.scan_content_comment lexbuf
279 | Decl_comment -> lexerset.scan_decl_comment lexbuf
280 | Document_comment -> lexerset.scan_document_comment lexbuf
281 | Ignored_section -> assert false
282 (* Ignored_section: only used by method next_ignored_token *)
285 prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok);
286 (* Find out the number of lines and characters of the last line: *)
287 let n_lines, n_columns =
288 if errors_with_line_numbers then
289 count_lines (Lexing.lexeme lexbuf)
291 0, (Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf)
293 line <- this_line + n_lines;
294 column <- if n_lines = 0 then this_column + n_columns else n_columns;
295 pos <- Lexing.lexeme_end lexbuf;
297 (* Throw Ignore and Comment away; Interpret entity references: *)
298 (* NOTE: Of course, references to general entities are not allowed
299 * everywhere; parameter references, too. This is already done by the
300 * lexers, i.e. &name; and %name; are recognized only where they
304 (* TODO: last_token is only used to detect Bof. Can be simplified *)
306 let at_bof = (last_token = Bof) in
312 (* Entity references: *)
315 let en, extdecl = dtd # gen_entity n in
316 if dtd # standalone_declaration && extdecl then
319 ("Reference to entity `" ^ n ^
320 "' violates standalone declaration"));
321 en # set_debugging_mode debug;
322 en # open_entity true lex_id;
323 self # manager # push_entity en;
326 let en = dtd # par_entity n in
327 en # set_debugging_mode debug;
328 en # open_entity force_parameter_entity_parsing lex_id;
329 self # manager # push_entity en;
332 (* Convert LineEnd to CharData *)
334 if normalize_newline then
339 (* Also normalize CDATA sections *)
340 | Cdata value as cd ->
341 if normalize_newline then
342 Cdata(normalize_line_separators lexerset value)
346 (* If there are CRLF sequences in a PI value, normalize them, too *)
347 | PI(name,value) as pi ->
348 if normalize_newline then
349 PI(name, normalize_line_separators lexerset value)
353 (* Attribute values: If they are already normalized, they are turned
354 * into Attval_nl_normalized. This is detected by other code.
356 | Attval value as av ->
357 if normalize_newline then
360 Attval_nl_normalized value
362 (* Another CRLF normalization case: Unparsed_string *)
363 | Unparsed_string value as ustr ->
364 if normalize_newline then
365 Unparsed_string(normalize_line_separators lexerset value)
369 (* These tokens require that the entity_id parameter is set: *)
370 | Doctype _ -> Doctype (self :> entity_id)
371 | Doctype_rangle _ ->Doctype_rangle(self :> entity_id)
372 | Dtd_begin _ -> Dtd_begin (self :> entity_id)
373 | Dtd_end _ -> Dtd_end (self :> entity_id)
374 | Decl_element _ -> Decl_element (self :> entity_id)
375 | Decl_attlist _ -> Decl_attlist (self :> entity_id)
376 | Decl_entity _ -> Decl_entity (self :> entity_id)
377 | Decl_notation _ ->Decl_notation (self :> entity_id)
378 | Decl_rangle _ -> Decl_rangle (self :> entity_id)
379 | Lparen _ -> Lparen (self :> entity_id)
380 | Rparen _ -> Rparen (self :> entity_id)
381 | RparenPlus _ -> RparenPlus (self :> entity_id)
382 | RparenStar _ -> RparenStar (self :> entity_id)
383 | RparenQmark _ -> RparenQmark (self :> entity_id)
384 | Conditional_begin _ -> Conditional_begin (self :> entity_id)
385 | Conditional_body _ -> Conditional_body (self :> entity_id)
386 | Conditional_end _ -> Conditional_end (self :> entity_id)
387 | Tag_beg (n,_) -> Tag_beg (n, (self :> entity_id))
388 | Tag_end (n,_) -> Tag_end (n, (self :> entity_id))
394 prerr_endline ("- Entity " ^ name ^ " # handle_eof");
395 let tok = self # handle_eof in
396 prerr_endline ("- Entity " ^ name ^ " # handle_eof: returns " ^ string_of_tok tok);
402 (* The default case. *)
408 if at_bof & tok <> Eof
411 prerr_endline ("- Entity " ^ name ^ " # handle_bof");
412 self # handle_bof tok'
419 (* 'handle_bof' and 'handle_eof' can be used as hooks. Behaviour:
421 * - Normally, the first token t is read in, and 'handle_bof t' is
422 * called. The return value of this method is what is returned to
424 * - If the EOF has been reached, 'handle_eof' is called.
425 * - BUT: If the first token is already EOF, 'handle_eof' is called
426 * ONLY, and 'handle_bof' is NOT called.
428 * The default implementations:
429 * - handle_bof: does nothing
430 * - handle_eof: Pops the previous entity from the stack, switches back
431 * to this entity, and returns the next token of this entity.
435 method private handle_bof tok =
439 method private handle_eof =
440 let mng = self # manager in
443 let next_lex_id = self # close_entity in
444 let en = mng # current_entity in
445 en # set_lex_id next_lex_id;
449 (* The outermost entity is at EOF *)
454 method next_ignored_token =
455 (* used after <![ IGNORE *)
457 (* TODO: Do we need a test on deferred tokens here? *)
460 and this_column = column in
461 let this_pos = pos in
462 let tok, lex_id' = lexerset.scan_ignored_section lexbuf in
464 prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok ^ " (Ignored)");
465 let n_lines, n_columns = count_lines (Lexing.lexeme lexbuf) in
466 line <- this_line + n_lines;
467 column <- if n_lines = 0 then this_column + n_columns else n_columns;
468 pos <- Lexing.lexeme_end lexbuf;
470 | Conditional_begin _ -> Conditional_begin (self :> entity_id)
471 | Conditional_end _ -> Conditional_end (self :> entity_id)
475 method process_xmldecl pl =
476 (* The parser calls this method just after the XML declaration
477 * <?xml ...?> has been detected.
478 * 'pl': This is the argument of the PI_xml token.
481 prerr_endline ("- Entity " ^ name ^ " # process_xmldecl");
483 prolog_pairs <- decode_xml_pi pl;
484 if check_text_declaration then
485 check_text_xml_pi prolog_pairs;
488 let e = List.assoc "encoding" prolog_pairs in
489 self # set_encoding e
492 self # set_encoding ""
496 method process_missing_xmldecl =
497 (* The parser calls this method if the XML declaration is missing *)
499 prerr_endline ("- Entity " ^ name ^ " # process_missing_xmldecl");
500 self # set_encoding ""
503 (* Methods for NDATA entities only: *)
504 method ext_id = (assert false : ext_id)
505 method notation = (assert false : string)
511 class ndata_entity the_name the_ext_id the_notation init_encoding =
513 (* An NDATA entity is very restricted; more or less you can only find out
514 * its external ID and its notation.
517 val mutable name = the_name
518 val mutable ext_id = the_ext_id
519 val mutable notation = the_notation
520 val encoding = (init_encoding : rep_encoding)
522 method name = (name : string)
523 method ext_id = (ext_id : ext_id)
524 method notation = (notation : string)
526 method is_ndata = true
528 method encoding = encoding
531 val mutable counts_as_external = false
533 method counts_as_external = counts_as_external
534 (* Whether the entity counts as external (for the standalone check). *)
536 method set_counts_as_external =
537 counts_as_external <- true
540 method set_manager (m : < current_entity : entity;
542 push_entity : entity -> unit >) =
543 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
546 method set_lex_id (id : lexers) =
547 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
551 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
555 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
559 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
562 method private set_encoding (_:string) =
565 method xml_declaration = (None : (string*string) list option)
567 method set_debugging_mode (_:bool) = ()
569 method open_entity (_:bool) (_:lexers) =
570 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
573 method close_entity =
574 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
577 method replacement_text =
578 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
582 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
586 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
589 method next_ignored_token =
590 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
593 method process_xmldecl (pl:prolog_token list) =
594 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
597 method process_missing_xmldecl =
598 ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name))
605 class external_entity the_resolver the_dtd the_name the_warner the_ext_id
606 the_p_special_empty_entities
607 init_errors_with_line_numbers
612 the_dtd the_name the_warner init_errors_with_line_numbers
616 (* An external entity gets the lexbuf that is used as character source
618 * Furthermore, before the first token an Begin_entity is inserted, and
619 * before Eof an End_entity token is inserted into the stream. This done
620 * always regardless of the argument 'force_parsing' of the method
623 * 'the_p_internal_subset': see class internal_entity
624 * 'the_p_special_empty_entities': if true, a Begin_entity/End_entity
625 * brace is left out if the entity is otherwise empty.
628 val resolver = (the_resolver : resolver)
629 val ext_id = (the_ext_id : ext_id)
631 val p_special_empty_entities = (the_p_special_empty_entities : bool)
633 val mutable resolver_is_open = false
634 (* Track if the resolver is open. This is also used to find recursive
635 * references of entities.
638 val mutable state = At_beginning
641 counts_as_external <- true;
644 method private set_encoding e =
645 assert resolver_is_open;
646 resolver # change_encoding e
652 System s -> " = SYSTEM \"" ^ s ^ "\""
653 | Public(p,s) -> " = PUBLIC \"" ^ p ^ "\" \"" ^ s ^ "\""
654 | Anonymous -> " = ANONYMOUS"
657 method open_entity force_parsing init_lex_id =
658 (* Note that external entities are always parsed, i.e. Begin_entity
659 * and End_entity tokens embrace the inner tokens to force that
660 * the entity is only called where the syntax allows it.
662 if resolver_is_open then
663 raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
666 resolver # open_in ext_id
668 Pxp_reader.Not_competent ->
669 raise(Error ("No input method available for this external entity: " ^
671 | Pxp_reader.Not_resolvable Not_found ->
672 raise(Error ("Unable to open the external entity: " ^
674 | Pxp_reader.Not_resolvable e ->
675 raise(Error ("Unable to open the external entity: " ^
676 self # full_name ^ "; reason: " ^
679 resolver_is_open <- true;
682 lex_id <- init_lex_id;
683 state <- At_beginning;
688 normalize_newline <- true;
691 method private handle_bof tok =
692 (* This hook is only called if the stream is not empty. *)
693 deferred_token <- Some [ tok ];
694 state <- Inserted_begin_entity;
698 method private handle_eof =
699 (* This hook is called if the end of the stream is reached *)
702 (* This is only possible if the stream is empty. *)
703 if p_special_empty_entities then begin
704 (* Continue immediately with the next token *)
709 (* Insert Begin_entity / End_entity *)
710 deferred_token <- Some [ End_entity ];
713 (* After these two token have been processed, the lexer
714 * is called again, and it will return another Eof.
717 | Inserted_begin_entity ->
718 (* Insert End_entity, too. *)
722 (* Continue with the next token: *)
726 method close_entity =
727 if not resolver_is_open then
728 failwith ("External entity " ^ name ^ " not open");
730 resolver_is_open <- false;
734 method replacement_text =
735 (* Return the replacement text of the entity. The method used for this
736 * is more or less the same as for internal entities; i.e. character
737 * and parameter entities are resolved immediately. In addition to that,
738 * external entities may begin with an "xml" processing instruction
739 * which is considered not to be part of the replacement text.
741 if resolver_is_open then
742 raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
743 let lex = resolver # open_in ext_id in
744 resolver_is_open <- true;
747 (* arbitrary: lex_id <- init_lex_id; *)
748 state <- At_beginning;
753 (* First check if the first token of 'lex' is <?xml...?> *)
754 begin match lexerset.scan_only_xml_decl lex with
756 self # process_xmldecl pl
758 (* This only means that the first token was not <?xml...?>;
759 * the "Eof" token represents the empty string.
761 self # process_missing_xmldecl
763 (* Must not happen. *)
766 (* Then create the replacement text. *)
767 let rec scan_and_expand () =
768 match lexerset.scan_dtd_string lexbuf with
769 ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
770 | CRef(-1) -> "\n" ^ scan_and_expand()
771 | CRef(-2) -> "\n" ^ scan_and_expand()
772 | CRef(-3) -> "\n" ^ scan_and_expand()
773 | CRef k -> character encoding warner k ^ scan_and_expand()
774 | CharData x -> x ^ scan_and_expand()
776 let en = dtd # par_entity n in
777 let (x,_) = en # replacement_text in
778 x ^ scan_and_expand()
784 let rtext = scan_and_expand() in
786 resolver_is_open <- false;
789 * - The replaced text is not parsed [VALIDATION WEAKNESS]
795 class document_entity the_resolver the_dtd the_name the_warner the_ext_id
796 init_errors_with_line_numbers
800 inherit external_entity the_resolver the_dtd the_name the_warner
801 the_ext_id false init_errors_with_line_numbers
804 (* A document entity is an external entity that does not allow
805 * conditional sections, and that forces that internal parameter entities
806 * are properly nested.
810 force_parameter_entity_parsing <- true;
811 check_text_declaration <- false;
813 method counts_as_external = false
814 (* Document entities count never as external! *)
819 class internal_entity the_dtd the_name the_warner the_literal_value
820 the_p_internal_subset init_errors_with_line_numbers
821 init_is_parameter_entity
824 (* An internal entity uses a "literal entity value" as character source.
825 * This value is first expanded and preprocessed, i.e. character and
826 * parameter references are expanded.
828 * 'the_p_internal_subset': indicates that the entity is declared in the
829 * internal subset. Such entity declarations are not allowed to contain
830 * references to parameter entities.
831 * 'init_is_parameter_entity': whether this is a parameter entity or not
836 the_dtd the_name the_warner init_errors_with_line_numbers
840 val p_internal_subset = the_p_internal_subset
842 val mutable replacement_text = ""
843 val mutable contains_external_references = false
844 val mutable p_parsed_actually = false
845 val mutable is_open = false
846 val mutable state = At_beginning
847 val mutable is_parameter_entity = init_is_parameter_entity
851 let lexbuf = Lexing.from_string the_literal_value in
852 let rec scan_and_expand () =
853 match lexerset.scan_dtd_string lexbuf with
854 ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
855 | CRef(-1) -> "\r\n" ^ scan_and_expand()
856 | CRef(-2) -> "\r" ^ scan_and_expand()
857 | CRef(-3) -> "\n" ^ scan_and_expand()
858 | CRef k -> character encoding warner k ^ scan_and_expand()
859 | CharData x -> x ^ scan_and_expand()
861 if p_internal_subset then
862 raise(WF_error("Restriction of the internal subset: parameter entity not allowed here"));
863 let en = dtd # par_entity n in
864 let (x, extref) = en # replacement_text in
865 contains_external_references <-
866 contains_external_references or extref;
867 x ^ scan_and_expand()
874 replacement_text <- scan_and_expand();
876 normalize_newline <- false;
877 counts_as_external <- false;
880 method process_xmldecl (pl:prolog_token list) =
881 raise(Validation_error("The encoding cannot be changed in internal entities"))
884 method process_missing_xmldecl =
888 method private set_encoding e =
889 (* Ignored if e = "" *)
893 method open_entity force_parsing init_lex_id =
895 raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
897 p_parsed_actually <- force_parsing;
898 lexbuf <- Lexing.from_string
899 (if is_parameter_entity then
900 (" " ^ replacement_text ^ " ")
904 lex_id <- init_lex_id;
905 state <- At_beginning;
913 method private handle_bof tok =
914 (* This hook is only called if the stream is not empty. *)
915 if p_parsed_actually then begin
916 deferred_token <- Some [ tok ];
917 state <- Inserted_begin_entity;
926 method private handle_eof =
927 (* This hook is called if the end of the stream is reached *)
930 (* This is only possible if the stream is empty. *)
931 if p_parsed_actually then begin
932 (* Insert Begin_entity / End_entity *)
933 deferred_token <- Some [ End_entity ];
936 (* After these two token have been processed, the lexer
937 * is called again, and it will return another Eof.
941 (* Continue immediately with the next token *)
945 | Inserted_begin_entity ->
946 (* Insert End_entity, too. *)
950 (* Continue with the next token: *)
954 method close_entity =
956 failwith ("Internal entity " ^ name ^ " not open");
961 method replacement_text =
963 raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
964 replacement_text, contains_external_references
968 (**********************************************************************)
970 (* An 'entity_manager' is a stack of entities, where the topmost entity
971 * is the currently active entity, the second entity is the entity that
972 * referred to the active entity, and so on.
974 * The entity_manager can communicate with the currently active entity.
976 * The entity_manager provides an interface for the parser; the functions
977 * returning the current token and the next token are exported.
980 class entity_manager (init_entity : entity) =
982 val mutable entity_stack = Stack.create()
983 val mutable current_entity = init_entity
984 val mutable current_entity's_full_name = lazy (init_entity # full_name)
986 val mutable yy_get_next_ref = ref (fun () -> assert false)
989 init_entity # set_manager (self :>
990 < current_entity : entity;
992 push_entity : entity -> unit >
994 yy_get_next_ref := (fun () -> init_entity # next_token)
996 method push_entity e =
997 e # set_manager (self :>
998 < current_entity : entity;
1000 push_entity : entity -> unit >
1002 Stack.push (current_entity, current_entity's_full_name) entity_stack;
1003 current_entity <- e;
1004 current_entity's_full_name <- lazy (e # full_name);
1005 yy_get_next_ref := (fun () -> e # next_token);
1008 (* May raise Stack.Empty *)
1009 let e, e_name = Stack.pop entity_stack in
1010 current_entity <- e;
1011 current_entity's_full_name <- e_name;
1012 yy_get_next_ref := (fun () -> e # next_token);
1016 method position_string =
1017 (* Gets a string describing the position of the last token;
1018 * includes an entity backtrace
1020 let b = Buffer.create 200 in
1022 ("In entity " ^ current_entity # full_name
1023 ^ ", at line " ^ string_of_int (current_entity # line)
1024 ^ ", position " ^ string_of_int (current_entity # column)
1029 ("Called from entity " ^ Lazy.force e_name
1030 ^ ", line " ^ string_of_int (e # line)
1031 ^ ", position " ^ string_of_int (e # column)
1039 (* Returns the triple (full_name, line, column) of the last token *)
1040 Lazy.force current_entity's_full_name,
1041 current_entity # line,
1042 current_entity # column
1045 method current_entity_counts_as_external =
1046 (* Whether the current entity counts as external to the main
1047 * document for the purpose of stand-alone checks.
1049 (* TODO: improve performance *)
1050 let is_external = ref false in
1052 if e # counts_as_external then begin
1053 is_external := true;
1056 check (current_entity,());
1057 Stack.iter check entity_stack;
1061 method current_entity = current_entity
1063 method yy_get_next_ref = yy_get_next_ref
1070 (* ======================================================================
1074 * Revision 1.1 2000/11/17 09:57:29 lpadovan
1077 * Revision 1.6 2000/07/14 13:55:00 gerd
1080 * Revision 1.5 2000/07/09 17:51:50 gerd
1081 * Entities return now the beginning of a token as its
1083 * New method 'position' for entity_manager.
1085 * Revision 1.4 2000/07/09 01:05:04 gerd
1086 * Exported methods 'ext_id' and 'notation' anyway.
1088 * Revision 1.3 2000/07/08 16:28:05 gerd
1089 * Updated: Exception 'Not_resolvable' is taken into account.
1091 * Revision 1.2 2000/07/04 22:12:47 gerd
1092 * Update: Case ext_id = Anonymous.
1093 * Update: Handling of the exception Not_competent when reading
1096 * Revision 1.1 2000/05/29 23:48:38 gerd
1097 * Changed module names:
1098 * Markup_aux into Pxp_aux
1099 * Markup_codewriter into Pxp_codewriter
1100 * Markup_document into Pxp_document
1101 * Markup_dtd into Pxp_dtd
1102 * Markup_entity into Pxp_entity
1103 * Markup_lexer_types into Pxp_lexer_types
1104 * Markup_reader into Pxp_reader
1105 * Markup_types into Pxp_types
1106 * Markup_yacc into Pxp_yacc
1107 * See directory "compatibility" for (almost) compatible wrappers emulating
1108 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
1110 * ======================================================================
1111 * Old logs from markup_entity.ml:
1113 * Revision 1.27 2000/05/29 21:14:57 gerd
1114 * Changed the type 'encoding' into a polymorphic variant.
1116 * Revision 1.26 2000/05/28 17:24:55 gerd
1119 * Revision 1.25 2000/05/27 19:23:32 gerd
1120 * The entities store whether they count as external with
1121 * respect to the standalone check: New methods counts_as_external
1122 * and set_counts_as_external.
1123 * The entity manager can find out whether the current
1124 * entity counts as external: method current_entity_counts_as_external.
1126 * Revision 1.24 2000/05/20 20:31:40 gerd
1127 * Big change: Added support for various encodings of the
1128 * internal representation.
1130 * Revision 1.23 2000/05/14 21:51:24 gerd
1131 * Change: Whitespace is handled by the grammar, and no longer
1134 * Revision 1.22 2000/05/14 17:50:54 gerd
1135 * Updates because of changes in the token type.
1137 * Revision 1.21 2000/05/09 00:02:44 gerd
1138 * Conditional sections are now recognized by the parser.
1139 * There seem some open questions; see the TODO comments!
1141 * Revision 1.20 2000/05/08 21:58:22 gerd
1142 * Introduced entity_manager as communication object between
1143 * the parser and the currently active entity.
1144 * New hooks handle_bof and handle_eof.
1145 * Removed "delegated entities". The entity manager contains
1146 * the stack of open entities.
1147 * Changed the way Begin_entity and End_entity are inserted.
1148 * This is now done by handle_bof and handle_eof.
1149 * The XML declaration is no longer detected by the entity.
1150 * This is now done by the parser.
1152 * Revision 1.19 2000/05/01 15:18:44 gerd
1153 * Improved CRLF handling in the replacement text of entities.
1154 * Changed one error message.
1156 * Revision 1.18 2000/04/30 18:18:39 gerd
1157 * Bugfixes: The conversion of CR and CRLF to LF is now hopefully
1158 * done right. The new variable "normalize_newline" indicates whether
1159 * normalization must happen for that type of entity. The normalization
1160 * if actually carried out separately for every token that needs it.
1162 * Revision 1.17 2000/03/13 23:42:38 gerd
1163 * Removed the resolver classes, and put them into their
1164 * own module (Markup_reader).
1166 * Revision 1.16 2000/02/22 01:06:58 gerd
1167 * Bugfix: Resolvers are properly re-initialized. This bug caused
1168 * that entities could not be referenced twice in the same document.
1170 * Revision 1.15 2000/01/20 20:54:11 gerd
1171 * New config.errors_with_line_numbers.
1173 * Revision 1.14 2000/01/08 18:59:03 gerd
1174 * Corrected the string resolver.
1176 * Revision 1.13 1999/09/01 22:58:23 gerd
1177 * Method warn_not_latin1 raises Illegal_character if the character
1178 * does not match the Char production.
1179 * External entities that are not document entities check if the
1180 * <?xml...?> declaration at the beginning matches the TextDecl production.
1181 * Method xml_declaration has type ... list option, not ... list.
1182 * Tag_beg and Tag_end now carry an entity_id with them.
1183 * The code to check empty entities has changed. That the Begin_entity/
1184 * End_entity pair is not to be added must be explicitly turned on. See the
1185 * description of empty entity handling in design.txt.
1186 * In internal subsets entity declarations are not allowed to refer
1187 * to parameter entities. The internal_entity class can do this now.
1188 * The p_parsed parameter of internal_entity has gone. It was simply
1191 * Revision 1.12 1999/09/01 16:24:13 gerd
1192 * The method replacement_text returns the text as described for
1193 * "included in literal". The former behaviour has been dropped to include
1194 * a leading and a trailing space character for parameter entities.
1195 * Bugfix: When general entities are included, they are always parsed.
1197 * Revision 1.11 1999/08/31 19:13:31 gerd
1198 * Added checks on proper PE nesting. The idea is that tokens such
1199 * as Decl_element and Decl_rangle carry an entity ID with them. This ID
1200 * is simply an object of type < >, i.e. you can only test on identity.
1201 * The lexer always produces tokens with a dummy ID because it does not
1202 * know which entity is the current one. The entity layer replaces the dummy
1203 * ID with the actual ID. The parser checks that the IDs of pairs such as
1204 * Decl_element and Decl_rangle are the same; otherwise a Validation_error
1207 * Revision 1.10 1999/08/19 01:06:41 gerd
1208 * Improved error messages: external entities print their
1211 * Revision 1.9 1999/08/15 20:35:48 gerd
1212 * Improved error messages.
1213 * Before the tokens Plus, Star, Qmark space is not allowed any longer.
1214 * Detection of recursive entity references is a bit cleaner.
1216 * Revision 1.8 1999/08/15 15:33:44 gerd
1217 * Revised whitespace checking: At certain positions there must be
1218 * white space. These checks cannot be part of the lexer, as %entity; counts
1219 * as white space. They cannot be part of the yacc parser because one look-ahead
1220 * token would not suffice if we did that. So these checks must be done by the
1221 * entity layer. Luckily, the rules are simple: There are simply a number of
1222 * token pairs between which white space must occur independently of where
1223 * these token have been found. Two variables, "space_seen", and "last_token"
1224 * have been added in order to check these rules.
1226 * Revision 1.7 1999/08/15 00:41:06 gerd
1227 * The [ token of conditional sections is now allowed to occur
1228 * in a different entity.
1230 * Revision 1.6 1999/08/15 00:29:02 gerd
1231 * The method "attlist_replacement_text" has gone. There is now a
1232 * more general "replacement_text" method that computes the replacement
1233 * text for both internal and external entities. Additionally, this method
1234 * returns whether references to external entities have been resolved;
1235 * this is checked in the cases where formerly "attlist_replacement_text"
1236 * was used as it is not allowed everywhere.
1237 * Entities have a new slot "need_spaces" that indicates that the
1238 * next token must be white space or a parameter reference. The problem
1239 * was that "<!ATTLIST%e;" is legal because when including parameter
1240 * entities white space is added implicitly. Formerly, the white space
1241 * was expected by the underlying lexer; now the lexer does not check
1242 * anymore that "<!ATTLIST" is followed by white space because the lexer
1243 * cannot handle parameter references. Because of this, the check on
1244 * white space must be done by the entity.
1246 * Revision 1.5 1999/08/14 22:57:19 gerd
1247 * It is allowed that external entities are empty because the
1248 * empty string is well-parsed for both declarations and contents. Empty
1249 * entities can be referenced anywhere because the references are replaced
1250 * by nothing. Because of this, the Begin_entity...End_entity brace is only
1251 * inserted if the entity is non-empty. (Otherwise references to empty
1252 * entities would not be allowed anywhere.)
1253 * As a consequence, the grammar has been changed such that a
1254 * single Eof is equivalent to Begin_entity,End_entity without content.
1256 * Revision 1.4 1999/08/14 22:11:19 gerd
1257 * Several objects have now a "warner" as argument which is
1258 * an object with a "warn" method. This is used to warn about characters
1259 * that cannot be represented in the Latin 1 alphabet.
1260 * Previously, the resolvers had features in order to warn about
1261 * such characters; this has been removed.
1262 * UTF-8 streams can be read even if they contain characters
1263 * that cannot be represented by 16 bits.
1264 * The buffering used in the resolvers is now solved in a
1265 * cleaner way; the number of characters that are expected to be read
1266 * from a source can be limited. This removes a bug with UTF-16 streams
1267 * that previously lead to wrong exceptions; and the buffering is more
1270 * Revision 1.3 1999/08/11 14:58:53 gerd
1271 * Some more names for encodings are allowed, such as "utf8" instead
1272 * of the standard name "UTF-8".
1273 * 'resolve_as_file' interprets relative file names as relative to
1274 * the "parent" resolver.
1276 * Revision 1.2 1999/08/10 21:35:07 gerd
1277 * The XML/encoding declaration at the beginning of entities is
1278 * evaluated. In particular, entities have now a method "xml_declaration"
1279 * which returns the name/value pairs of such a declaration. The "encoding"
1280 * setting is interpreted by the entity itself; "version", and "standalone"
1281 * are interpreted by Markup_yacc.parse_document_entity. Other settings
1282 * are ignored (this does not conform to the standard; the standard prescribes
1283 * that "version" MUST be given in the declaration of document; "standalone"
1284 * and "encoding" CAN be declared; no other settings are allowed).
1285 * TODO: The user should be warned if the standard is not exactly
1286 * fulfilled. -- The "standalone" property is not checked yet.
1288 * Revision 1.1 1999/08/10 00:35:51 gerd