diff --git a/helm/DEVEL/pxp/pxp/pxp_entity.ml b/helm/DEVEL/pxp/pxp/pxp_entity.ml
new file mode 100644
index 000000000..94b21aefe
--- /dev/null
+++ b/helm/DEVEL/pxp/pxp/pxp_entity.ml
@@ -0,0 +1,1292 @@
(* $Id$
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 *)


(* TODO:
 * - How do we prevent an internal entity from accepting an XML
 *   declaration in its replacement text?
 *)


open Pxp_types
open Pxp_lexer_types
open Pxp_aux
open Pxp_reader

(* Hierarchy of parsing layers:
 *
 * - Parser: Pxp_yacc
 *   + gets input stream from the main entity object
 *   + checks most of the grammar
 *   + creates the DTD object as side-effect
 *   + creates the element tree as side-effect
 *   + creates further entity objects that are entered into the DTD
 * - Entity layer: Pxp_entity
 *   + gets input stream from the lexers, or from another entity object
 *   + handles entity references: if a reference is encountered, the
 *     input stream is redirected such that the tokens come from the
 *     referenced entity object
 *   + handles conditional sections
 * - Lexer layer: Pxp_lexers
 *   + gets input from lexbuffers created by resolvers
 *   + different lexers for different lexical contexts
 *   + a lexer returns pairs (token, lexid), where token is the scanned
 *     token, and lexid is the name of the lexer that must be used for
 *     the next token
 * - Resolver layer: Pxp_reader
 *   + a resolver creates the lexbuf from some character source
 *   + a resolver recodes the input and handles the encoding scheme
 *)

(**********************************************************************)

(* Variables of type 'state' are used to insert Begin_entity and End_entity
 * tokens into the stream.
 * - At_beginning: Nothing has been read so far
 * - Inserted_begin_entity: A Begin_entity token has been inserted; the
 *   first real token is pending as a deferred token. (Begin_entity/
 *   End_entity must not be inserted if the entity is empty.)
 * - At_end: Eof has been read, and End_entity has been returned.
 *)

type state =
    At_beginning
  | Inserted_begin_entity
  | At_end
;;


(**********************************************************************)

class virtual entity the_dtd the_name the_warner
                     init_errors_with_line_numbers init_encoding =
  object (self)
    (* This class prescribes the type of all entity objects. Furthermore,
     * the default 'next_token' mechanism is implemented.
     *)

    (* 'init_errors_with_line_numbers': whether error messages contain line
     * numbers or not.
     * Calculating line numbers is expensive.
     *)
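    (* Illustration (a sketch, not from the original source): for a parsed,
     * non-empty entity with replacement text "ab", the consumer of this
     * entity sees the token sequence
     *
     *     Begin_entity; CharData "ab"; End_entity
     *
     * whereas for an empty entity the state machine above never leaves
     * At_beginning, so no Begin_entity/End_entity pair is inserted at all.
     *)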
+ *) + + val mutable dtd = the_dtd + val mutable name = the_name + val mutable warner = the_warner + + val encoding = (init_encoding : rep_encoding) + val lexerset = Pxp_lexers.get_lexer_set init_encoding + + method encoding = encoding + (* method lexerset = lexerset *) + + val mutable manager = None + (* The current entity_manager, see below *) + + method private manager = + ( match manager with + None -> assert false + | Some m -> m + : < current_entity : entity; + pop_entity : unit; + push_entity : entity -> unit > + ) + + method set_manager m = manager <- Some m + + + val mutable lexbuf = Lexing.from_string "" + (* The lexical buffer currently used as character source. *) + + val mutable prolog = None + (* Stores the initial token as PI_xml *) + + val mutable prolog_pairs = [] + (* If prolog <> None, these are the (name,value) pairs of the + * processing instruction. + *) + + + val mutable lex_id = Document + (* The name of the lexer that should be used for the next token *) + + method set_lex_id id = lex_id <- lex_id + + + + val mutable force_parameter_entity_parsing = false + (* 'true' forces that inner entities will always be embraced by + * Begin_entity and End_entity. + * 'false': the inner entity itself decides this + *) + + val mutable check_text_declaration = true + (* 'true': It is checked that the declaration matches the + * production TextDecl. + *) + + val mutable normalize_newline = true + (* Whether this entity converts CRLF or CR to LF, or not *) + + + val mutable line = 1 (* current line *) + val mutable column = 0 (* current column *) + val mutable pos = 0 (* current absolute character position *) + val errors_with_line_numbers = init_errors_with_line_numbers + + val mutable p_line = 1 + val mutable p_column = 1 + + method line = p_line + method column = p_column + + + val mutable counts_as_external = false + + method counts_as_external = counts_as_external + (* Whether the entity counts as external (for the standalone check). *) + + method set_counts_as_external = + counts_as_external <- true + + + val mutable last_token = Bof + (* XXX + * These two variables are used to check that between certain pairs of + * tokens whitespaces exist. 'last_token' is simply the last token, + * but not Ignore, and not PERef (which both represent whitespace). + * 'space_seen' records whether Ignore or PERef was seen between this + * token and 'last_token'. + *) + + val mutable deferred_token = None + (* If you set this to Some tl, the next invocations of + * next_token_from_entity will return the tokens in tl. + * This makes it possible to insert tokens into the stream. + *) + + val mutable debug = false + + method is_ndata = false + (* Returns if this entity is an NDATA (unparsed) entity *) + + method name = name + + method virtual open_entity : bool -> lexers -> unit + (* open_entity force_parsing lexid: + * opens the entity, and the first token is scanned by the lexer + * 'lexid'. 'force_parsing' forces that Begin_entity and End_entity + * tokens embrace the inner tokens of the entity; otherwise this + * depends on the entity. + * By opening an entity, reading tokens from it, and finally closing + * the entity, the inclusion methods "Included", + * "Included if validating", and "Included as PE" can be carried out. + * Which method is chosen depends on the 'lexid', i.e. the lexical + * context: 'lexid = Content' performs "Included (if validating)" (we + * are always validating); 'lexid = Declaration' performs + * "Included as PE". 
    method virtual close_entity : lexers
      (* close_entity:
       * closes the entity and returns the name of the lexer that must
       * be used to scan the next token.
       *)

    method virtual replacement_text : (string * bool)
      (* replacement_text:
       * returns the replacement text of the entity, and as second value,
       * whether the replacement text was constructed by referencing
       * external entities (directly or indirectly).
       * This method implements the inclusion method "Included in Literal".
       *)


    method lexbuf = lexbuf


    method xml_declaration =
      (* return the (name,value) pairs of the initial <?xml ...?>
       * processing instruction.
       *)
      match prolog with
          None ->
            None
        | Some _ ->
            Some prolog_pairs


    method set_debugging_mode m =
      debug <- m

    method private virtual set_encoding : string -> unit


    method full_name =
      name


    method next_token =
      (* read next token from this entity *)

      match deferred_token with
          Some toklist ->
            ( match toklist with
                  [] ->
                    deferred_token <- None;
                    self # next_token
                | tok :: toklist' ->
                    deferred_token <- Some toklist';
                    if debug then
                      prerr_endline ("- Entity " ^ name ^ ": " ^
                                     string_of_tok tok ^ " (deferred)");
                    tok
            )
        | None -> begin
            let this_line = line
            and this_column = column in
            let this_pos = pos in
            p_line <- this_line;
            p_column <- this_column;
            (* Read the next token from the appropriate lexer lex_id, and
             * get the name lex_id' of the next lexer to be used.
+ *) + let tok, lex_id' = + match lex_id with + Document -> lexerset.scan_document lexbuf + | Document_type -> lexerset.scan_document_type lexbuf + | Content -> lexerset.scan_content lexbuf + | Within_tag -> lexerset.scan_within_tag lexbuf + | Declaration -> lexerset.scan_declaration lexbuf + | Content_comment -> lexerset.scan_content_comment lexbuf + | Decl_comment -> lexerset.scan_decl_comment lexbuf + | Document_comment -> lexerset.scan_document_comment lexbuf + | Ignored_section -> assert false + (* Ignored_section: only used by method next_ignored_token *) + in + if debug then + prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok); + (* Find out the number of lines and characters of the last line: *) + let n_lines, n_columns = + if errors_with_line_numbers then + count_lines (Lexing.lexeme lexbuf) + else + 0, (Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf) + in + line <- this_line + n_lines; + column <- if n_lines = 0 then this_column + n_columns else n_columns; + pos <- Lexing.lexeme_end lexbuf; + lex_id <- lex_id'; + (* Throw Ignore and Comment away; Interpret entity references: *) + (* NOTE: Of course, references to general entities are not allowed + * everywhere; parameter references, too. This is already done by the + * lexers, i.e. &name; and %name; are recognized only where they + * are allowed. + *) + + (* TODO: last_token is only used to detect Bof. Can be simplified *) + + let at_bof = (last_token = Bof) in + last_token <- tok; + + let tok' = + match tok with + + (* Entity references: *) + + | ERef n -> + let en, extdecl = dtd # gen_entity n in + if dtd # standalone_declaration && extdecl then + raise + (Validation_error + ("Reference to entity `" ^ n ^ + "' violates standalone declaration")); + en # set_debugging_mode debug; + en # open_entity true lex_id; + self # manager # push_entity en; + en # next_token; + | PERef n -> + let en = dtd # par_entity n in + en # set_debugging_mode debug; + en # open_entity force_parameter_entity_parsing lex_id; + self # manager # push_entity en; + en # next_token; + + (* Convert LineEnd to CharData *) + | LineEnd s -> + if normalize_newline then + CharData "\n" + else + CharData s + + (* Also normalize CDATA sections *) + | Cdata value as cd -> + if normalize_newline then + Cdata(normalize_line_separators lexerset value) + else + cd + + (* If there are CRLF sequences in a PI value, normalize them, too *) + | PI(name,value) as pi -> + if normalize_newline then + PI(name, normalize_line_separators lexerset value) + else + pi + + (* Attribute values: If they are already normalized, they are turned + * into Attval_nl_normalized. This is detected by other code. 
+ *) + | Attval value as av -> + if normalize_newline then + av + else + Attval_nl_normalized value + + (* Another CRLF normalization case: Unparsed_string *) + | Unparsed_string value as ustr -> + if normalize_newline then + Unparsed_string(normalize_line_separators lexerset value) + else + ustr + + (* These tokens require that the entity_id parameter is set: *) + | Doctype _ -> Doctype (self :> entity_id) + | Doctype_rangle _ ->Doctype_rangle(self :> entity_id) + | Dtd_begin _ -> Dtd_begin (self :> entity_id) + | Dtd_end _ -> Dtd_end (self :> entity_id) + | Decl_element _ -> Decl_element (self :> entity_id) + | Decl_attlist _ -> Decl_attlist (self :> entity_id) + | Decl_entity _ -> Decl_entity (self :> entity_id) + | Decl_notation _ ->Decl_notation (self :> entity_id) + | Decl_rangle _ -> Decl_rangle (self :> entity_id) + | Lparen _ -> Lparen (self :> entity_id) + | Rparen _ -> Rparen (self :> entity_id) + | RparenPlus _ -> RparenPlus (self :> entity_id) + | RparenStar _ -> RparenStar (self :> entity_id) + | RparenQmark _ -> RparenQmark (self :> entity_id) + | Conditional_begin _ -> Conditional_begin (self :> entity_id) + | Conditional_body _ -> Conditional_body (self :> entity_id) + | Conditional_end _ -> Conditional_end (self :> entity_id) + | Tag_beg (n,_) -> Tag_beg (n, (self :> entity_id)) + | Tag_end (n,_) -> Tag_end (n, (self :> entity_id)) + + (* End of file: *) + + | Eof -> + if debug then begin + prerr_endline ("- Entity " ^ name ^ " # handle_eof"); + let tok = self # handle_eof in + prerr_endline ("- Entity " ^ name ^ " # handle_eof: returns " ^ string_of_tok tok); + tok + end + else + self # handle_eof; + + (* The default case. *) + + | _ -> + tok + + in + if at_bof & tok <> Eof + then begin + if debug then + prerr_endline ("- Entity " ^ name ^ " # handle_bof"); + self # handle_bof tok' + end + else + tok' + end + + + (* 'handle_bof' and 'handle_eof' can be used as hooks. Behaviour: + * + * - Normally, the first token t is read in, and 'handle_bof t' is + * called. The return value of this method is what is returned to + * the user. + * - If the EOF has been reached, 'handle_eof' is called. + * - BUT: If the first token is already EOF, 'handle_eof' is called + * ONLY, and 'handle_bof' is NOT called. + * + * The default implementations: + * - handle_bof: does nothing + * - handle_eof: Pops the previous entity from the stack, switches back + * to this entity, and returns the next token of this entity. + *) + + + method private handle_bof tok = + tok + + + method private handle_eof = + let mng = self # manager in + begin try + mng # pop_entity; + let next_lex_id = self # close_entity in + let en = mng # current_entity in + en # set_lex_id next_lex_id; + en # next_token + with + Stack.Empty -> + (* The outermost entity is at EOF *) + Eof + end + + + method next_ignored_token = + (* used after Conditional_begin (self :> entity_id) + | Conditional_end _ -> Conditional_end (self :> entity_id) + | _ -> tok + + + method process_xmldecl pl = + (* The parser calls this method just after the XML declaration + * has been detected. + * 'pl': This is the argument of the PI_xml token. 
+ *) + if debug then + prerr_endline ("- Entity " ^ name ^ " # process_xmldecl"); + prolog <- Some pl; + prolog_pairs <- decode_xml_pi pl; + if check_text_declaration then + check_text_xml_pi prolog_pairs; + begin + try + let e = List.assoc "encoding" prolog_pairs in + self # set_encoding e + with + Not_found -> + self # set_encoding "" + end; + + + method process_missing_xmldecl = + (* The parser calls this method if the XML declaration is missing *) + if debug then + prerr_endline ("- Entity " ^ name ^ " # process_missing_xmldecl"); + self # set_encoding "" + + + (* Methods for NDATA entities only: *) + method ext_id = (assert false : ext_id) + method notation = (assert false : string) + + end +;; + + +class ndata_entity the_name the_ext_id the_notation init_encoding = + object (self) + (* An NDATA entity is very restricted; more or less you can only find out + * its external ID and its notation. + *) + + val mutable name = the_name + val mutable ext_id = the_ext_id + val mutable notation = the_notation + val encoding = (init_encoding : rep_encoding) + + method name = (name : string) + method ext_id = (ext_id : ext_id) + method notation = (notation : string) + + method is_ndata = true + + method encoding = encoding + + + val mutable counts_as_external = false + + method counts_as_external = counts_as_external + (* Whether the entity counts as external (for the standalone check). *) + + method set_counts_as_external = + counts_as_external <- true + + + method set_manager (m : < current_entity : entity; + pop_entity : unit; + push_entity : entity -> unit >) = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : unit ) + + method set_lex_id (id : lexers) = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : unit ) + + method line = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : int ) + + method column = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : int ) + + method full_name = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : string ) + + method private set_encoding (_:string) = + assert false + + method xml_declaration = (None : (string*string) list option) + + method set_debugging_mode (_:bool) = () + + method open_entity (_:bool) (_:lexers) = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : unit ) + + method close_entity = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : lexers ) + + method replacement_text = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : (string * bool) ) + + method lexbuf = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : Lexing.lexbuf ) + + method next_token = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : token ) + + method next_ignored_token = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : token ) + + method process_xmldecl (pl:prolog_token list) = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : unit ) + + method process_missing_xmldecl = + ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) + : unit ) + + end +;; + + +class external_entity the_resolver the_dtd the_name the_warner the_ext_id + the_p_special_empty_entities + init_errors_with_line_numbers + init_encoding + = + object (self) + inherit entity + the_dtd the_name the_warner init_errors_with_line_numbers + init_encoding + as 
super

    (* An external entity gets the lexbuf that is used as character source
     * from a resolver.
     * Furthermore, a Begin_entity token is inserted before the first token,
     * and an End_entity token is inserted into the stream before Eof. This
     * is done always, regardless of the argument 'force_parsing' of the
     * method 'open_entity'.
     *
     * 'the_p_special_empty_entities': if true, the Begin_entity/End_entity
     * brace is left out if the entity is empty.
     *)

    val resolver = (the_resolver : resolver)
    val ext_id = (the_ext_id : ext_id)

    val p_special_empty_entities = (the_p_special_empty_entities : bool)

    val mutable resolver_is_open = false
      (* Track whether the resolver is open. This is also used to find
       * recursive references of entities.
       *)

    val mutable state = At_beginning

    initializer
      counts_as_external <- true;


    method private set_encoding e =
      assert resolver_is_open;
      resolver # change_encoding e


    method full_name =
      name ^
      match ext_id with
          System s    -> " = SYSTEM \"" ^ s ^ "\""
        | Public(p,s) -> " = PUBLIC \"" ^ p ^ "\" \"" ^ s ^ "\""
        | Anonymous   -> " = ANONYMOUS"


    method open_entity force_parsing init_lex_id =
      (* Note that external entities are always parsed, i.e. Begin_entity
       * and End_entity tokens embrace the inner tokens to force that
       * the entity is only called where the syntax allows it.
       *)
      if resolver_is_open then
        raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
      let lex =
        try
          resolver # open_in ext_id
        with
            Pxp_reader.Not_competent ->
              raise(Error ("No input method available for this external entity: " ^
                           self # full_name))
          | Pxp_reader.Not_resolvable Not_found ->
              raise(Error ("Unable to open the external entity: " ^
                           self # full_name))
          | Pxp_reader.Not_resolvable e ->
              raise(Error ("Unable to open the external entity: " ^
                           self # full_name ^ "; reason: " ^
                           string_of_exn e))
      in
      resolver_is_open <- true;
      lexbuf <- lex;
      prolog <- None;
      lex_id <- init_lex_id;
      state <- At_beginning;
      line <- 1;
      column <- 0;
      pos <- 0;
      last_token <- Bof;
      normalize_newline <- true;


    method private handle_bof tok =
      (* This hook is only called if the stream is not empty. *)
      deferred_token <- Some [ tok ];
      state <- Inserted_begin_entity;
      Begin_entity


    method private handle_eof =
      (* This hook is called if the end of the stream is reached *)
      match state with
          At_beginning ->
            (* This is only possible if the stream is empty. *)
            if p_special_empty_entities then begin
              (* Continue immediately with the next token *)
              state <- At_end;
              super # handle_eof
            end
            else begin
              (* Insert Begin_entity / End_entity *)
              deferred_token <- Some [ End_entity ];
              state <- At_end;
              Begin_entity;
              (* After these two tokens have been processed, the lexer
               * is called again, and it will return another Eof.
               *)
            end
        | Inserted_begin_entity ->
            (* Insert End_entity, too. *)
            state <- At_end;
            End_entity;
        | At_end ->
            (* Continue with the next token: *)
            super # handle_eof


    method close_entity =
      if not resolver_is_open then
        failwith ("External entity " ^ name ^ " not open");
      resolver # close_in;
      resolver_is_open <- false;
      lex_id
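    (* A sketch of the intended calling discipline (illustration only, not
     * part of the original interface; 'ent' stands for any external entity
     * and 'mng' for the entity_manager defined below):
     *
     *     ent # open_entity true Content;         (* resolver is opened *)
     *     mng # push_entity ent;
     *     ... ent # next_token ...                (* Begin_entity, inner
     *                                                tokens, End_entity *)
     *     (* on Eof, handle_eof pops the entity again *)
     *     let next_lexer = ent # close_entity in  (* resolver is closed *)
     *     ...
     *
     * Opening the same entity again before close_entity is a recursive
     * reference and raises Validation_error, see open_entity above.
     *)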
    method replacement_text =
      (* Return the replacement text of the entity. The method used for this
       * is more or less the same as for internal entities; i.e. character
       * and parameter entity references are resolved immediately. In
       * addition to that, external entities may begin with an "xml"
       * processing instruction which is considered not to be part of the
       * replacement text.
       *)
      if resolver_is_open then
        raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
      let lex = resolver # open_in ext_id in
      resolver_is_open <- true;
      lexbuf <- lex;
      prolog <- None;
      (* arbitrary: lex_id <- init_lex_id; *)
      state <- At_beginning;
      line <- 1;
      column <- 0;
      pos <- 0;
      last_token <- Bof;
      (* First check if the first token of 'lex' is <?xml ... ?> *)
      begin match lexerset.scan_only_xml_decl lex with
          PI_xml pl ->
            self # process_xmldecl pl
        | Eof ->
            (* This only means that the first token was not <?xml ... ?>;
             * the "Eof" token represents the empty string.
             *)
            self # process_missing_xmldecl
        | _ ->
            (* Must not happen. *)
            assert false
      end;
      (* Then create the replacement text. *)
      let rec scan_and_expand () =
        match lexerset.scan_dtd_string lexbuf with
            ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
          | CRef(-1) -> "\n" ^ scan_and_expand()
          | CRef(-2) -> "\n" ^ scan_and_expand()
          | CRef(-3) -> "\n" ^ scan_and_expand()
          | CRef k -> character encoding warner k ^ scan_and_expand()
          | CharData x -> x ^ scan_and_expand()
          | PERef n ->
              let en = dtd # par_entity n in
              let (x,_) = en # replacement_text in
              x ^ scan_and_expand()
          | Eof ->
              ""
          | _ ->
              assert false
      in
      let rtext = scan_and_expand() in
      resolver # close_in;
      resolver_is_open <- false;
      rtext, true
        (* TODO:
         * - The replaced text is not parsed [VALIDATION WEAKNESS]
         *)
  end
;;


class document_entity the_resolver the_dtd the_name the_warner the_ext_id
                      init_errors_with_line_numbers
                      init_encoding
  =
  object (self)
    inherit external_entity the_resolver the_dtd the_name the_warner
                            the_ext_id false init_errors_with_line_numbers
                            init_encoding

    (* A document entity is an external entity that does not allow
     * conditional sections, and that forces that internal parameter entities
     * are properly nested.
     *)

    initializer
      force_parameter_entity_parsing <- true;
      check_text_declaration <- false;

    method counts_as_external = false
      (* Document entities never count as external! *)
  end
;;
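(* Illustration (not part of the original file): what the scan_and_expand
 * loops above and below compute. Assuming the declarations
 *
 *     <!ENTITY % p "parameter text">
 *     <!ENTITY   e "a %p; and &#65; and &general;">
 *
 * the replacement text of e becomes: a parameter text and A and &general;
 * That is, parameter and character references are expanded immediately,
 * while general entity references are kept verbatim for later expansion.
 *)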
class internal_entity the_dtd the_name the_warner the_literal_value
                      the_p_internal_subset init_errors_with_line_numbers
                      init_is_parameter_entity
                      init_encoding
  =
  (* An internal entity uses a "literal entity value" as character source.
   * This value is first expanded and preprocessed, i.e. character and
   * parameter references are expanded.
   *
   * 'the_p_internal_subset': indicates that the entity is declared in the
   * internal subset. Such entity declarations are not allowed to contain
   * references to parameter entities.
   * 'init_is_parameter_entity': whether this is a parameter entity or not
   *)

  object (self)
    inherit entity
              the_dtd the_name the_warner init_errors_with_line_numbers
              init_encoding
            as super

    val p_internal_subset = the_p_internal_subset

    val mutable replacement_text = ""
    val mutable contains_external_references = false
    val mutable p_parsed_actually = false
    val mutable is_open = false
    val mutable state = At_beginning
    val mutable is_parameter_entity = init_is_parameter_entity


    initializer
      let lexbuf = Lexing.from_string the_literal_value in
      let rec scan_and_expand () =
        match lexerset.scan_dtd_string lexbuf with
            ERef n -> "&" ^ n ^ ";" ^ scan_and_expand()
          | CRef(-1) -> "\r\n" ^ scan_and_expand()
          | CRef(-2) -> "\r" ^ scan_and_expand()
          | CRef(-3) -> "\n" ^ scan_and_expand()
          | CRef k -> character encoding warner k ^ scan_and_expand()
          | CharData x -> x ^ scan_and_expand()
          | PERef n ->
              if p_internal_subset then
                raise(WF_error("Restriction of the internal subset: parameter entity not allowed here"));
              let en = dtd # par_entity n in
              let (x, extref) = en # replacement_text in
              contains_external_references <-
                contains_external_references or extref;
              x ^ scan_and_expand()
          | Eof ->
              ""
          | _ ->
              assert false
      in
      is_open <- true;
      replacement_text <- scan_and_expand();
      is_open <- false;
      normalize_newline <- false;
      counts_as_external <- false;


    method process_xmldecl (pl:prolog_token list) =
      raise(Validation_error("The encoding cannot be changed in internal entities"))


    method process_missing_xmldecl =
      ()


    method private set_encoding e =
      (* Ignored if e = "" *)
      assert(e = "");


    method open_entity force_parsing init_lex_id =
      if is_open then
        raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));

      p_parsed_actually <- force_parsing;
      lexbuf <- Lexing.from_string
                  (if is_parameter_entity then
                     (" " ^ replacement_text ^ " ")
                   else
                     replacement_text);
      prolog <- None;
      lex_id <- init_lex_id;
      state <- At_beginning;
      is_open <- true;
      line <- 1;
      column <- 0;
      pos <- 0;
      last_token <- Bof;
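    (* Illustration (not from the original source): when a parameter entity
     * is included as PE, one space is attached on each side of the
     * replacement text, as the XML spec requires. E.g. for
     *
     *     <!ENTITY % lists "ul | ol">
     *
     * a reference %lists; inside a declaration behaves like the text
     *
     *     " ul | ol "
     *
     * which is why open_entity above wraps the replacement text in spaces
     * for parameter entities.
     *)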
    method private handle_bof tok =
      (* This hook is only called if the stream is not empty. *)
      if p_parsed_actually then begin
        deferred_token <- Some [ tok ];
        state <- Inserted_begin_entity;
        Begin_entity
      end
      else begin
        state <- At_end;
        tok
      end


    method private handle_eof =
      (* This hook is called if the end of the stream is reached *)
      match state with
          At_beginning ->
            (* This is only possible if the stream is empty. *)
            if p_parsed_actually then begin
              (* Insert Begin_entity / End_entity *)
              deferred_token <- Some [ End_entity ];
              state <- At_end;
              Begin_entity;
              (* After these two tokens have been processed, the lexer
               * is called again, and it will return another Eof.
               *)
            end
            else begin
              (* Continue immediately with the next token *)
              state <- At_end;
              super # handle_eof
            end
        | Inserted_begin_entity ->
            (* Insert End_entity, too. *)
            state <- At_end;
            End_entity;
        | At_end ->
            (* Continue with the next token: *)
            super # handle_eof


    method close_entity =
      if not is_open then
        failwith ("Internal entity " ^ name ^ " not open");
      is_open <- false;
      lex_id


    method replacement_text =
      if is_open then
        raise(Validation_error("Recursive reference to entity `" ^ name ^ "'"));
      replacement_text, contains_external_references
  end
;;

(**********************************************************************)

(* An 'entity_manager' is a stack of entities, where the topmost entity
 * is the currently active entity, the second entity is the entity that
 * referred to the active entity, and so on.
 *
 * The entity_manager can communicate with the currently active entity.
 *
 * The entity_manager provides an interface for the parser; the functions
 * returning the current token and the next token are exported.
 *)

class entity_manager (init_entity : entity) =
  object (self)
    val mutable entity_stack = Stack.create()
    val mutable current_entity = init_entity
    val mutable current_entity's_full_name = lazy (init_entity # full_name)

    val mutable yy_get_next_ref = ref (fun () -> assert false)

    initializer
      init_entity # set_manager (self :>
                                 < current_entity : entity;
                                   pop_entity : unit;
                                   push_entity : entity -> unit >
                                );
      yy_get_next_ref := (fun () -> init_entity # next_token)

    method push_entity e =
      e # set_manager (self :>
                       < current_entity : entity;
                         pop_entity : unit;
                         push_entity : entity -> unit >
                      );
      Stack.push (current_entity, current_entity's_full_name) entity_stack;
      current_entity <- e;
      current_entity's_full_name <- lazy (e # full_name);
      yy_get_next_ref := (fun () -> e # next_token);

    method pop_entity =
      (* May raise Stack.Empty *)
      let e, e_name = Stack.pop entity_stack in
      current_entity <- e;
      current_entity's_full_name <- e_name;
      yy_get_next_ref := (fun () -> e # next_token);



    method position_string =
      (* Gets a string describing the position of the last token;
       * includes an entity backtrace
       *)
      let b = Buffer.create 200 in
      Buffer.add_string b
        ("In entity " ^ current_entity # full_name
         ^ ", at line " ^ string_of_int (current_entity # line)
         ^ ", position " ^ string_of_int (current_entity # column)
         ^ ":\n");
      Stack.iter
        (fun (e, e_name) ->
           Buffer.add_string b
             ("Called from entity " ^ Lazy.force e_name
              ^ ", line " ^ string_of_int (e # line)
              ^ ", position " ^ string_of_int (e # column)
              ^ ":\n");
        )
        entity_stack;
      Buffer.contents b


    method position =
      (* Returns the triple (full_name, line, column) of the last token *)
      Lazy.force current_entity's_full_name,
      current_entity # line,
      current_entity # column


    method current_entity_counts_as_external =
      (* Whether the current entity counts as external to the main
       * document for the purpose of stand-alone checks.
       *)
      (* TODO: improve performance *)
      let is_external = ref false in
      let check (e, _) =
        if e # counts_as_external then begin
          is_external := true;
        end;
      in
      check (current_entity, current_entity's_full_name);
      Stack.iter check entity_stack;
      !is_external


    method current_entity = current_entity

    method yy_get_next_ref = yy_get_next_ref

  end
;;
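(* Usage sketch (illustration only, not part of the original file; the
 * resolver 'r', the DTD 'dtd', the warner 'warner', and the encoding
 * constant are assumptions, not taken from this module):
 *
 *     let doc = new document_entity r dtd "[toplevel]" warner
 *                 (System "sample.xml") true `Enc_iso88591 in
 *     let mng = new entity_manager doc in
 *     doc # open_entity true Document;
 *     let next = !(mng # yy_get_next_ref) in
 *     (* next () now yields the token stream; on errors,
 *      * mng # position_string gives a backtrace of open entities.
 *      *)
 *)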
+ * + * Revision 1.5 2000/07/09 17:51:50 gerd + * Entities return now the beginning of a token as its + * position. + * New method 'position' for entity_manager. + * + * Revision 1.4 2000/07/09 01:05:04 gerd + * Exported methods 'ext_id' and 'notation' anyway. + * + * Revision 1.3 2000/07/08 16:28:05 gerd + * Updated: Exception 'Not_resolvable' is taken into account. + * + * Revision 1.2 2000/07/04 22:12:47 gerd + * Update: Case ext_id = Anonymous. + * Update: Handling of the exception Not_competent when reading + * from a resolver. + * + * Revision 1.1 2000/05/29 23:48:38 gerd + * Changed module names: + * Markup_aux into Pxp_aux + * Markup_codewriter into Pxp_codewriter + * Markup_document into Pxp_document + * Markup_dtd into Pxp_dtd + * Markup_entity into Pxp_entity + * Markup_lexer_types into Pxp_lexer_types + * Markup_reader into Pxp_reader + * Markup_types into Pxp_types + * Markup_yacc into Pxp_yacc + * See directory "compatibility" for (almost) compatible wrappers emulating + * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. + * + * ====================================================================== + * Old logs from markup_entity.ml: + * + * Revision 1.27 2000/05/29 21:14:57 gerd + * Changed the type 'encoding' into a polymorphic variant. + * + * Revision 1.26 2000/05/28 17:24:55 gerd + * Bugfixes. + * + * Revision 1.25 2000/05/27 19:23:32 gerd + * The entities store whether they count as external with + * respect to the standalone check: New methods counts_as_external + * and set_counts_as_external. + * The entity manager can find out whether the current + * entity counts as external: method current_entity_counts_as_external. + * + * Revision 1.24 2000/05/20 20:31:40 gerd + * Big change: Added support for various encodings of the + * internal representation. + * + * Revision 1.23 2000/05/14 21:51:24 gerd + * Change: Whitespace is handled by the grammar, and no longer + * by the entity. + * + * Revision 1.22 2000/05/14 17:50:54 gerd + * Updates because of changes in the token type. + * + * Revision 1.21 2000/05/09 00:02:44 gerd + * Conditional sections are now recognized by the parser. + * There seem some open questions; see the TODO comments! + * + * Revision 1.20 2000/05/08 21:58:22 gerd + * Introduced entity_manager as communication object between + * the parser and the currently active entity. + * New hooks handle_bof and handle_eof. + * Removed "delegated entities". The entity manager contains + * the stack of open entities. + * Changed the way Begin_entity and End_entity are inserted. + * This is now done by handle_bof and handle_eof. + * The XML declaration is no longer detected by the entity. + * This is now done by the parser. + * + * Revision 1.19 2000/05/01 15:18:44 gerd + * Improved CRLF handling in the replacement text of entities. + * Changed one error message. + * + * Revision 1.18 2000/04/30 18:18:39 gerd + * Bugfixes: The conversion of CR and CRLF to LF is now hopefully + * done right. The new variable "normalize_newline" indicates whether + * normalization must happen for that type of entity. The normalization + * if actually carried out separately for every token that needs it. + * + * Revision 1.17 2000/03/13 23:42:38 gerd + * Removed the resolver classes, and put them into their + * own module (Markup_reader). + * + * Revision 1.16 2000/02/22 01:06:58 gerd + * Bugfix: Resolvers are properly re-initialized. This bug caused + * that entities could not be referenced twice in the same document. 
+ * + * Revision 1.15 2000/01/20 20:54:11 gerd + * New config.errors_with_line_numbers. + * + * Revision 1.14 2000/01/08 18:59:03 gerd + * Corrected the string resolver. + * + * Revision 1.13 1999/09/01 22:58:23 gerd + * Method warn_not_latin1 raises Illegal_character if the character + * does not match the Char production. + * External entities that are not document entities check if the + * declaration at the beginning matches the TextDecl production. + * Method xml_declaration has type ... list option, not ... list. + * Tag_beg and Tag_end now carry an entity_id with them. + * The code to check empty entities has changed. That the Begin_entity/ + * End_entity pair is not to be added must be explicitly turned on. See the + * description of empty entity handling in design.txt. + * In internal subsets entity declarations are not allowed to refer + * to parameter entities. The internal_entity class can do this now. + * The p_parsed parameter of internal_entity has gone. It was simply + * superflous. + * + * Revision 1.12 1999/09/01 16:24:13 gerd + * The method replacement_text returns the text as described for + * "included in literal". The former behaviour has been dropped to include + * a leading and a trailing space character for parameter entities. + * Bugfix: When general entities are included, they are always parsed. + * + * Revision 1.11 1999/08/31 19:13:31 gerd + * Added checks on proper PE nesting. The idea is that tokens such + * as Decl_element and Decl_rangle carry an entity ID with them. This ID + * is simply an object of type < >, i.e. you can only test on identity. + * The lexer always produces tokens with a dummy ID because it does not + * know which entity is the current one. The entity layer replaces the dummy + * ID with the actual ID. The parser checks that the IDs of pairs such as + * Decl_element and Decl_rangle are the same; otherwise a Validation_error + * is produced. + * + * Revision 1.10 1999/08/19 01:06:41 gerd + * Improved error messages: external entities print their + * ext id, too + * + * Revision 1.9 1999/08/15 20:35:48 gerd + * Improved error messages. + * Before the tokens Plus, Star, Qmark space is not allowed any longer. + * Detection of recursive entity references is a bit cleaner. + * + * Revision 1.8 1999/08/15 15:33:44 gerd + * Revised whitespace checking: At certain positions there must be + * white space. These checks cannot be part of the lexer, as %entity; counts + * as white space. They cannot be part of the yacc parser because one look-ahead + * token would not suffice if we did that. So these checks must be done by the + * entity layer. Luckily, the rules are simple: There are simply a number of + * token pairs between which white space must occur independently of where + * these token have been found. Two variables, "space_seen", and "last_token" + * have been added in order to check these rules. + * + * Revision 1.7 1999/08/15 00:41:06 gerd + * The [ token of conditional sections is now allowed to occur + * in a different entity. + * + * Revision 1.6 1999/08/15 00:29:02 gerd + * The method "attlist_replacement_text" has gone. There is now a + * more general "replacement_text" method that computes the replacement + * text for both internal and external entities. Additionally, this method + * returns whether references to external entities have been resolved; + * this is checked in the cases where formerly "attlist_replacement_text" + * was used as it is not allowed everywhere. 
+ * Entities have a new slot "need_spaces" that indicates that the + * next token must be white space or a parameter reference. The problem + * was that "