X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_entity.ml;fp=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_entity.ml;h=0000000000000000000000000000000000000000;hb=c7514aaa249a96c5fdd39b1123fbdb38d92f20b6;hp=94b21aefe15378cb9e56da2aff9c6739f3893ef0;hpb=1c7fb836e2af4f2f3d18afd0396701f2094265ff;p=helm.git diff --git a/helm/DEVEL/pxp/pxp/pxp_entity.ml b/helm/DEVEL/pxp/pxp/pxp_entity.ml deleted file mode 100644 index 94b21aefe..000000000 --- a/helm/DEVEL/pxp/pxp/pxp_entity.ml +++ /dev/null @@ -1,1292 +0,0 @@ -(* $Id$ - * ---------------------------------------------------------------------- - * PXP: The polymorphic XML parser for Objective Caml. - * Copyright by Gerd Stolpmann. See LICENSE for details. - *) - - -(* TODO: - * - Wie verhindert man, dass ein internal entity eine XML-Dekl. im - * replacement text akzeptiert? - *) - - -open Pxp_types -open Pxp_lexer_types -open Pxp_aux -open Pxp_reader - -(* Hierarchy of parsing layers: - * - * - Parser: Pxp_yacc - * + gets input stream from the main entity object - * + checks most of the grammar - * + creates the DTD object as side-effect - * + creates the element tree as side-effect - * + creates further entity objects that are entered into the DTD - * - Entity layer: Pxp_entity - * + gets input stream from the lexers, or another entity object - * + handles entity references: if a reference is encountered the - * input stream is redirected such that the tokens come from the - * referenced entity object - * + handles conditional sections - * - Lexer layer: Pxp_lexers - * + gets input from lexbuffers created by resolvers - * + different lexers for different lexical contexts - * + a lexer returns pairs (token,lexid), where token is the scanned - * token, and lexid is the name of the lexer that must be used for - * the next token - * - Resolver layer: Pxp_entity - * + a resolver creates the lexbuf from some character source - * + a resolver recodes the input and handles the encoding scheme - *) 
- -(**********************************************************************) - -(* Variables of type 'state' are used to insert Begin_entity and End_entity - * tokens into the stream. - * - At_beginning: Nothing has been read so far - * - First_token tok: A Begin_entity has been inserted; and the next token - * is 'tok' which is not Eof. (Begin_entity/End_entity must not be inserted - * if the entity is empty.) - * - In_stream: After the first token has been read, but befor Eof. - * - At_end: Eof has been read, and End_entity has been returned. - *) - -type state = - At_beginning - | Inserted_begin_entity - | At_end -;; - - -(**********************************************************************) - -class virtual entity the_dtd the_name the_warner - init_errors_with_line_numbers init_encoding = - object (self) - (* This class prescribes the type of all entity objects. Furthermore, - * the default 'next_token' mechanism is implemented. - *) - - (* 'init_errors_with_line_numbers': whether error messages contain line - * numbers or not. - * Calculating line numbers is expensive. - *) - - val mutable dtd = the_dtd - val mutable name = the_name - val mutable warner = the_warner - - val encoding = (init_encoding : rep_encoding) - val lexerset = Pxp_lexers.get_lexer_set init_encoding - - method encoding = encoding - (* method lexerset = lexerset *) - - val mutable manager = None - (* The current entity_manager, see below *) - - method private manager = - ( match manager with - None -> assert false - | Some m -> m - : < current_entity : entity; - pop_entity : unit; - push_entity : entity -> unit > - ) - - method set_manager m = manager <- Some m - - - val mutable lexbuf = Lexing.from_string "" - (* The lexical buffer currently used as character source. *) - - val mutable prolog = None - (* Stores the initial token as PI_xml *) - - val mutable prolog_pairs = [] - (* If prolog <> None, these are the (name,value) pairs of the - * processing instruction. 
- *) - - - val mutable lex_id = Document - (* The name of the lexer that should be used for the next token *) - - method set_lex_id id = lex_id <- lex_id - - - - val mutable force_parameter_entity_parsing = false - (* 'true' forces that inner entities will always be embraced by - * Begin_entity and End_entity. - * 'false': the inner entity itself decides this - *) - - val mutable check_text_declaration = true - (* 'true': It is checked that the declaration matches the - * production TextDecl. - *) - - val mutable normalize_newline = true - (* Whether this entity converts CRLF or CR to LF, or not *) - - - val mutable line = 1 (* current line *) - val mutable column = 0 (* current column *) - val mutable pos = 0 (* current absolute character position *) - val errors_with_line_numbers = init_errors_with_line_numbers - - val mutable p_line = 1 - val mutable p_column = 1 - - method line = p_line - method column = p_column - - - val mutable counts_as_external = false - - method counts_as_external = counts_as_external - (* Whether the entity counts as external (for the standalone check). *) - - method set_counts_as_external = - counts_as_external <- true - - - val mutable last_token = Bof - (* XXX - * These two variables are used to check that between certain pairs of - * tokens whitespaces exist. 'last_token' is simply the last token, - * but not Ignore, and not PERef (which both represent whitespace). - * 'space_seen' records whether Ignore or PERef was seen between this - * token and 'last_token'. - *) - - val mutable deferred_token = None - (* If you set this to Some tl, the next invocations of - * next_token_from_entity will return the tokens in tl. - * This makes it possible to insert tokens into the stream. 
- *) - - val mutable debug = false - - method is_ndata = false - (* Returns if this entity is an NDATA (unparsed) entity *) - - method name = name - - method virtual open_entity : bool -> lexers -> unit - (* open_entity force_parsing lexid: - * opens the entity, and the first token is scanned by the lexer - * 'lexid'. 'force_parsing' forces that Begin_entity and End_entity - * tokens embrace the inner tokens of the entity; otherwise this - * depends on the entity. - * By opening an entity, reading tokens from it, and finally closing - * the entity, the inclusion methods "Included", - * "Included if validating", and "Included as PE" can be carried out. - * Which method is chosen depends on the 'lexid', i.e. the lexical - * context: 'lexid = Content' performs "Included (if validating)" (we - * are always validating); 'lexid = Declaration' performs - * "Included as PE". The difference is which tokens are recognized, - * and how spaces are handled. - * 'force_parsing' causes that a Begin_entity token is inserted before - * and an End_entity token is inserted after the entity. The yacc - * rules allow the Begin_entity ... End_entity brace only at certain - * positions; this is used to restrict the possible positions where - * entities may be included, and to guarantee that the entity matches - * a certain production of the grammar ("parsed entities"). - * 'open_entity' is currently invoked with 'force_parsing = true' - * for toplevel nodes, for inclusion of internal general entities, - * and for inclusion of parameter entities into document entities. - * 'force_parsing = false' is used for all other cases: External - * entities add the Begin_entity/End_entity tokens anyway; internal - * entities do not. Especially internal parameter entities referenced - * from non-document entities do not add these tokens. 
- *) - - method virtual close_entity : lexers - (* close_entity: - * closes the entity and returns the name of the lexer that must - * be used to scan the next token. - *) - - method virtual replacement_text : (string * bool) - (* replacement_text: - * returns the replacement text of the entity, and as second value, - * whether the replacement text was constructed by referencing - * external entities (directly or indirectly). - * This method implements the inclusion method "Included in Literal". - *) - - - method lexbuf = lexbuf - - - method xml_declaration = - (* return the (name,value) pairs of the initial - * processing instruction. - *) - match prolog with - None -> - None - | Some p -> - Some prolog_pairs - - - method set_debugging_mode m = - debug <- m - - method private virtual set_encoding : string -> unit - - - method full_name = - name - - - method next_token = - (* read next token from this entity *) - - match deferred_token with - Some toklist -> - ( match toklist with - [] -> - deferred_token <- None; - self # next_token - | tok :: toklist' -> - deferred_token <- Some toklist'; - if debug then - prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok ^ " (deferred)"); - tok - ) - | None -> begin - let this_line = line - and this_column = column in - let this_pos = pos in - p_line <- this_line; - p_column <- this_column; - (* Read the next token from the appropriate lexer lex_id, and get the - * name lex_id' of the next lexer to be used. 
- *) - let tok, lex_id' = - match lex_id with - Document -> lexerset.scan_document lexbuf - | Document_type -> lexerset.scan_document_type lexbuf - | Content -> lexerset.scan_content lexbuf - | Within_tag -> lexerset.scan_within_tag lexbuf - | Declaration -> lexerset.scan_declaration lexbuf - | Content_comment -> lexerset.scan_content_comment lexbuf - | Decl_comment -> lexerset.scan_decl_comment lexbuf - | Document_comment -> lexerset.scan_document_comment lexbuf - | Ignored_section -> assert false - (* Ignored_section: only used by method next_ignored_token *) - in - if debug then - prerr_endline ("- Entity " ^ name ^ ": " ^ string_of_tok tok); - (* Find out the number of lines and characters of the last line: *) - let n_lines, n_columns = - if errors_with_line_numbers then - count_lines (Lexing.lexeme lexbuf) - else - 0, (Lexing.lexeme_end lexbuf - Lexing.lexeme_start lexbuf) - in - line <- this_line + n_lines; - column <- if n_lines = 0 then this_column + n_columns else n_columns; - pos <- Lexing.lexeme_end lexbuf; - lex_id <- lex_id'; - (* Throw Ignore and Comment away; Interpret entity references: *) - (* NOTE: Of course, references to general entities are not allowed - * everywhere; parameter references, too. This is already done by the - * lexers, i.e. &name; and %name; are recognized only where they - * are allowed. - *) - - (* TODO: last_token is only used to detect Bof. 
Can be simplified *) - - let at_bof = (last_token = Bof) in - last_token <- tok; - - let tok' = - match tok with - - (* Entity references: *) - - | ERef n -> - let en, extdecl = dtd # gen_entity n in - if dtd # standalone_declaration && extdecl then - raise - (Validation_error - ("Reference to entity `" ^ n ^ - "' violates standalone declaration")); - en # set_debugging_mode debug; - en # open_entity true lex_id; - self # manager # push_entity en; - en # next_token; - | PERef n -> - let en = dtd # par_entity n in - en # set_debugging_mode debug; - en # open_entity force_parameter_entity_parsing lex_id; - self # manager # push_entity en; - en # next_token; - - (* Convert LineEnd to CharData *) - | LineEnd s -> - if normalize_newline then - CharData "\n" - else - CharData s - - (* Also normalize CDATA sections *) - | Cdata value as cd -> - if normalize_newline then - Cdata(normalize_line_separators lexerset value) - else - cd - - (* If there are CRLF sequences in a PI value, normalize them, too *) - | PI(name,value) as pi -> - if normalize_newline then - PI(name, normalize_line_separators lexerset value) - else - pi - - (* Attribute values: If they are already normalized, they are turned - * into Attval_nl_normalized. This is detected by other code. 
- *) - | Attval value as av -> - if normalize_newline then - av - else - Attval_nl_normalized value - - (* Another CRLF normalization case: Unparsed_string *) - | Unparsed_string value as ustr -> - if normalize_newline then - Unparsed_string(normalize_line_separators lexerset value) - else - ustr - - (* These tokens require that the entity_id parameter is set: *) - | Doctype _ -> Doctype (self :> entity_id) - | Doctype_rangle _ ->Doctype_rangle(self :> entity_id) - | Dtd_begin _ -> Dtd_begin (self :> entity_id) - | Dtd_end _ -> Dtd_end (self :> entity_id) - | Decl_element _ -> Decl_element (self :> entity_id) - | Decl_attlist _ -> Decl_attlist (self :> entity_id) - | Decl_entity _ -> Decl_entity (self :> entity_id) - | Decl_notation _ ->Decl_notation (self :> entity_id) - | Decl_rangle _ -> Decl_rangle (self :> entity_id) - | Lparen _ -> Lparen (self :> entity_id) - | Rparen _ -> Rparen (self :> entity_id) - | RparenPlus _ -> RparenPlus (self :> entity_id) - | RparenStar _ -> RparenStar (self :> entity_id) - | RparenQmark _ -> RparenQmark (self :> entity_id) - | Conditional_begin _ -> Conditional_begin (self :> entity_id) - | Conditional_body _ -> Conditional_body (self :> entity_id) - | Conditional_end _ -> Conditional_end (self :> entity_id) - | Tag_beg (n,_) -> Tag_beg (n, (self :> entity_id)) - | Tag_end (n,_) -> Tag_end (n, (self :> entity_id)) - - (* End of file: *) - - | Eof -> - if debug then begin - prerr_endline ("- Entity " ^ name ^ " # handle_eof"); - let tok = self # handle_eof in - prerr_endline ("- Entity " ^ name ^ " # handle_eof: returns " ^ string_of_tok tok); - tok - end - else - self # handle_eof; - - (* The default case. *) - - | _ -> - tok - - in - if at_bof & tok <> Eof - then begin - if debug then - prerr_endline ("- Entity " ^ name ^ " # handle_bof"); - self # handle_bof tok' - end - else - tok' - end - - - (* 'handle_bof' and 'handle_eof' can be used as hooks. 
Behaviour: - * - * - Normally, the first token t is read in, and 'handle_bof t' is - * called. The return value of this method is what is returned to - * the user. - * - If the EOF has been reached, 'handle_eof' is called. - * - BUT: If the first token is already EOF, 'handle_eof' is called - * ONLY, and 'handle_bof' is NOT called. - * - * The default implementations: - * - handle_bof: does nothing - * - handle_eof: Pops the previous entity from the stack, switches back - * to this entity, and returns the next token of this entity. - *) - - - method private handle_bof tok = - tok - - - method private handle_eof = - let mng = self # manager in - begin try - mng # pop_entity; - let next_lex_id = self # close_entity in - let en = mng # current_entity in - en # set_lex_id next_lex_id; - en # next_token - with - Stack.Empty -> - (* The outermost entity is at EOF *) - Eof - end - - - method next_ignored_token = - (* used after Conditional_begin (self :> entity_id) - | Conditional_end _ -> Conditional_end (self :> entity_id) - | _ -> tok - - - method process_xmldecl pl = - (* The parser calls this method just after the XML declaration - * has been detected. - * 'pl': This is the argument of the PI_xml token. 
- *) - if debug then - prerr_endline ("- Entity " ^ name ^ " # process_xmldecl"); - prolog <- Some pl; - prolog_pairs <- decode_xml_pi pl; - if check_text_declaration then - check_text_xml_pi prolog_pairs; - begin - try - let e = List.assoc "encoding" prolog_pairs in - self # set_encoding e - with - Not_found -> - self # set_encoding "" - end; - - - method process_missing_xmldecl = - (* The parser calls this method if the XML declaration is missing *) - if debug then - prerr_endline ("- Entity " ^ name ^ " # process_missing_xmldecl"); - self # set_encoding "" - - - (* Methods for NDATA entities only: *) - method ext_id = (assert false : ext_id) - method notation = (assert false : string) - - end -;; - - -class ndata_entity the_name the_ext_id the_notation init_encoding = - object (self) - (* An NDATA entity is very restricted; more or less you can only find out - * its external ID and its notation. - *) - - val mutable name = the_name - val mutable ext_id = the_ext_id - val mutable notation = the_notation - val encoding = (init_encoding : rep_encoding) - - method name = (name : string) - method ext_id = (ext_id : ext_id) - method notation = (notation : string) - - method is_ndata = true - - method encoding = encoding - - - val mutable counts_as_external = false - - method counts_as_external = counts_as_external - (* Whether the entity counts as external (for the standalone check). 
*) - - method set_counts_as_external = - counts_as_external <- true - - - method set_manager (m : < current_entity : entity; - pop_entity : unit; - push_entity : entity -> unit >) = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : unit ) - - method set_lex_id (id : lexers) = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : unit ) - - method line = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : int ) - - method column = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : int ) - - method full_name = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : string ) - - method private set_encoding (_:string) = - assert false - - method xml_declaration = (None : (string*string) list option) - - method set_debugging_mode (_:bool) = () - - method open_entity (_:bool) (_:lexers) = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : unit ) - - method close_entity = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : lexers ) - - method replacement_text = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : (string * bool) ) - - method lexbuf = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : Lexing.lexbuf ) - - method next_token = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : token ) - - method next_ignored_token = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : token ) - - method process_xmldecl (pl:prolog_token list) = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : unit ) - - method process_missing_xmldecl = - ( raise (Validation_error ("Invalid reference to NDATA entity " ^ name)) - : unit ) - - end -;; - - -class external_entity the_resolver the_dtd the_name the_warner the_ext_id - the_p_special_empty_entities - 
init_errors_with_line_numbers - init_encoding - = - object (self) - inherit entity - the_dtd the_name the_warner init_errors_with_line_numbers - init_encoding - as super - - (* An external entity gets the lexbuf that is used as character source - * from a resolver. - * Furthermore, before the first token an Begin_entity is inserted, and - * before Eof an End_entity token is inserted into the stream. This done - * always regardless of the argument 'force_parsing' of the method - * 'open_entity'. - * - * 'the_p_internal_subset': see class internal_entity - * 'the_p_special_empty_entities': if true, a Begin_entity/End_entity - * brace is left out if the entity is otherwise empty. - *) - - val resolver = (the_resolver : resolver) - val ext_id = (the_ext_id : ext_id) - - val p_special_empty_entities = (the_p_special_empty_entities : bool) - - val mutable resolver_is_open = false - (* Track if the resolver is open. This is also used to find recursive - * references of entities. - *) - - val mutable state = At_beginning - - initializer - counts_as_external <- true; - - - method private set_encoding e = - assert resolver_is_open; - resolver # change_encoding e - - - method full_name = - name ^ - match ext_id with - System s -> " = SYSTEM \"" ^ s ^ "\"" - | Public(p,s) -> " = PUBLIC \"" ^ p ^ "\" \"" ^ s ^ "\"" - | Anonymous -> " = ANONYMOUS" - - - method open_entity force_parsing init_lex_id = - (* Note that external entities are always parsed, i.e. Begin_entity - * and End_entity tokens embrace the inner tokens to force that - * the entity is only called where the syntax allows it. 
- *) - if resolver_is_open then - raise(Validation_error("Recursive reference to entity `" ^ name ^ "'")); - let lex = - try - resolver # open_in ext_id - with - Pxp_reader.Not_competent -> - raise(Error ("No input method available for this external entity: " ^ - self # full_name)) - | Pxp_reader.Not_resolvable Not_found -> - raise(Error ("Unable to open the external entity: " ^ - self # full_name)) - | Pxp_reader.Not_resolvable e -> - raise(Error ("Unable to open the external entity: " ^ - self # full_name ^ "; reason: " ^ - string_of_exn e)) - in - resolver_is_open <- true; - lexbuf <- lex; - prolog <- None; - lex_id <- init_lex_id; - state <- At_beginning; - line <- 1; - column <- 0; - pos <- 0; - last_token <- Bof; - normalize_newline <- true; - - - method private handle_bof tok = - (* This hook is only called if the stream is not empty. *) - deferred_token <- Some [ tok ]; - state <- Inserted_begin_entity; - Begin_entity - - - method private handle_eof = - (* This hook is called if the end of the stream is reached *) - match state with - At_beginning -> - (* This is only possible if the stream is empty. *) - if p_special_empty_entities then begin - (* Continue immediately with the next token *) - state <- At_end; - super # handle_eof - end - else begin - (* Insert Begin_entity / End_entity *) - deferred_token <- Some [ End_entity ]; - state <- At_end; - Begin_entity; - (* After these two token have been processed, the lexer - * is called again, and it will return another Eof. - *) - end - | Inserted_begin_entity -> - (* Insert End_entity, too. *) - state <- At_end; - End_entity; - | At_end -> - (* Continue with the next token: *) - super # handle_eof - - - method close_entity = - if not resolver_is_open then - failwith ("External entity " ^ name ^ " not open"); - resolver # close_in; - resolver_is_open <- false; - lex_id - - - method replacement_text = - (* Return the replacement text of the entity. 
The method used for this - * is more or less the same as for internal entities; i.e. character - * and parameter entities are resolved immediately. In addition to that, - * external entities may begin with an "xml" processing instruction - * which is considered not to be part of the replacement text. - *) - if resolver_is_open then - raise(Validation_error("Recursive reference to entity `" ^ name ^ "'")); - let lex = resolver # open_in ext_id in - resolver_is_open <- true; - lexbuf <- lex; - prolog <- None; - (* arbitrary: lex_id <- init_lex_id; *) - state <- At_beginning; - line <- 1; - column <- 0; - pos <- 0; - last_token <- Bof; - (* First check if the first token of 'lex' is *) - begin match lexerset.scan_only_xml_decl lex with - PI_xml pl -> - self # process_xmldecl pl - | Eof -> - (* This only means that the first token was not ; - * the "Eof" token represents the empty string. - *) - self # process_missing_xmldecl - | _ -> - (* Must not happen. *) - assert false - end; - (* Then create the replacement text. 
*) - let rec scan_and_expand () = - match lexerset.scan_dtd_string lexbuf with - ERef n -> "&" ^ n ^ ";" ^ scan_and_expand() - | CRef(-1) -> "\n" ^ scan_and_expand() - | CRef(-2) -> "\n" ^ scan_and_expand() - | CRef(-3) -> "\n" ^ scan_and_expand() - | CRef k -> character encoding warner k ^ scan_and_expand() - | CharData x -> x ^ scan_and_expand() - | PERef n -> - let en = dtd # par_entity n in - let (x,_) = en # replacement_text in - x ^ scan_and_expand() - | Eof -> - "" - | _ -> - assert false - in - let rtext = scan_and_expand() in - resolver # close_in; - resolver_is_open <- false; - rtext, true - (* TODO: - * - The replaced text is not parsed [VALIDATION WEAKNESS] - *) - end -;; - - -class document_entity the_resolver the_dtd the_name the_warner the_ext_id - init_errors_with_line_numbers - init_encoding - = - object (self) - inherit external_entity the_resolver the_dtd the_name the_warner - the_ext_id false init_errors_with_line_numbers - init_encoding - - (* A document entity is an external entity that does not allow - * conditional sections, and that forces that internal parameter entities - * are properly nested. - *) - - initializer - force_parameter_entity_parsing <- true; - check_text_declaration <- false; - - method counts_as_external = false - (* Document entities count never as external! *) - end -;; - - -class internal_entity the_dtd the_name the_warner the_literal_value - the_p_internal_subset init_errors_with_line_numbers - init_is_parameter_entity - init_encoding - = - (* An internal entity uses a "literal entity value" as character source. - * This value is first expanded and preprocessed, i.e. character and - * parameter references are expanded. - * - * 'the_p_internal_subset': indicates that the entity is declared in the - * internal subset. Such entity declarations are not allowed to contain - * references to parameter entities. 
- * 'init_is_parameter_entity': whether this is a parameter entity or not - *) - - object (self) - inherit entity - the_dtd the_name the_warner init_errors_with_line_numbers - init_encoding - as super - - val p_internal_subset = the_p_internal_subset - - val mutable replacement_text = "" - val mutable contains_external_references = false - val mutable p_parsed_actually = false - val mutable is_open = false - val mutable state = At_beginning - val mutable is_parameter_entity = init_is_parameter_entity - - - initializer - let lexbuf = Lexing.from_string the_literal_value in - let rec scan_and_expand () = - match lexerset.scan_dtd_string lexbuf with - ERef n -> "&" ^ n ^ ";" ^ scan_and_expand() - | CRef(-1) -> "\r\n" ^ scan_and_expand() - | CRef(-2) -> "\r" ^ scan_and_expand() - | CRef(-3) -> "\n" ^ scan_and_expand() - | CRef k -> character encoding warner k ^ scan_and_expand() - | CharData x -> x ^ scan_and_expand() - | PERef n -> - if p_internal_subset then - raise(WF_error("Restriction of the internal subset: parameter entity not allowed here")); - let en = dtd # par_entity n in - let (x, extref) = en # replacement_text in - contains_external_references <- - contains_external_references or extref; - x ^ scan_and_expand() - | Eof -> - "" - | _ -> - assert false - in - is_open <- true; - replacement_text <- scan_and_expand(); - is_open <- false; - normalize_newline <- false; - counts_as_external <- false; - - - method process_xmldecl (pl:prolog_token list) = - raise(Validation_error("The encoding cannot be changed in internal entities")) - - - method process_missing_xmldecl = - () - - - method private set_encoding e = - (* Ignored if e = "" *) - assert(e = ""); - - - method open_entity force_parsing init_lex_id = - if is_open then - raise(Validation_error("Recursive reference to entity `" ^ name ^ "'")); - - p_parsed_actually <- force_parsing; - lexbuf <- Lexing.from_string - (if is_parameter_entity then - (" " ^ replacement_text ^ " ") - else - replacement_text); - 
prolog <- None; - lex_id <- init_lex_id; - state <- At_beginning; - is_open <- true; - line <- 1; - column <- 0; - pos <- 0; - last_token <- Eof; - - - method private handle_bof tok = - (* This hook is only called if the stream is not empty. *) - if p_parsed_actually then begin - deferred_token <- Some [ tok ]; - state <- Inserted_begin_entity; - Begin_entity - end - else begin - state <- At_end; - tok - end - - - method private handle_eof = - (* This hook is called if the end of the stream is reached *) - match state with - At_beginning -> - (* This is only possible if the stream is empty. *) - if p_parsed_actually then begin - (* Insert Begin_entity / End_entity *) - deferred_token <- Some [ End_entity ]; - state <- At_end; - Begin_entity; - (* After these two token have been processed, the lexer - * is called again, and it will return another Eof. - *) - end - else begin - (* Continue immediately with the next token *) - state <- At_end; - super # handle_eof - end - | Inserted_begin_entity -> - (* Insert End_entity, too. *) - state <- At_end; - End_entity; - | At_end -> - (* Continue with the next token: *) - super # handle_eof - - - method close_entity = - if not is_open then - failwith ("Internal entity " ^ name ^ " not open"); - is_open <- false; - lex_id - - - method replacement_text = - if is_open then - raise(Validation_error("Recursive reference to entity `" ^ name ^ "'")); - replacement_text, contains_external_references - end -;; - -(**********************************************************************) - -(* An 'entity_manager' is a stack of entities, where the topmost entity - * is the currently active entity, the second entity is the entity that - * referred to the active entity, and so on. - * - * The entity_manager can communicate with the currently active entity. - * - * The entity_manager provides an interface for the parser; the functions - * returning the current token and the next token are exported. 
- *) - -class entity_manager (init_entity : entity) = - object (self) - val mutable entity_stack = Stack.create() - val mutable current_entity = init_entity - val mutable current_entity's_full_name = lazy (init_entity # full_name) - - val mutable yy_get_next_ref = ref (fun () -> assert false) - - initializer - init_entity # set_manager (self :> - < current_entity : entity; - pop_entity : unit; - push_entity : entity -> unit > - ); - yy_get_next_ref := (fun () -> init_entity # next_token) - - method push_entity e = - e # set_manager (self :> - < current_entity : entity; - pop_entity : unit; - push_entity : entity -> unit > - ); - Stack.push (current_entity, current_entity's_full_name) entity_stack; - current_entity <- e; - current_entity's_full_name <- lazy (e # full_name); - yy_get_next_ref := (fun () -> e # next_token); - - method pop_entity = - (* May raise Stack.Empty *) - let e, e_name = Stack.pop entity_stack in - current_entity <- e; - current_entity's_full_name <- e_name; - yy_get_next_ref := (fun () -> e # next_token); - - - - method position_string = - (* Gets a string describing the position of the last token; - * includes an entity backtrace - *) - let b = Buffer.create 200 in - Buffer.add_string b - ("In entity " ^ current_entity # full_name - ^ ", at line " ^ string_of_int (current_entity # line) - ^ ", position " ^ string_of_int (current_entity # column) - ^ ":\n"); - Stack.iter - (fun (e, e_name) -> - Buffer.add_string b - ("Called from entity " ^ Lazy.force e_name - ^ ", line " ^ string_of_int (e # line) - ^ ", position " ^ string_of_int (e # column) - ^ ":\n"); - ) - entity_stack; - Buffer.contents b - - - method position = - (* Returns the triple (full_name, line, column) of the last token *) - Lazy.force current_entity's_full_name, - current_entity # line, - current_entity # column - - - method current_entity_counts_as_external = - (* Whether the current entity counts as external to the main - * document for the purpose of stand-alone checks. 
- *) - (* TODO: improve performance *) - let is_external = ref false in - let check (e, _) = - if e # counts_as_external then begin - is_external := true; - end; - in - check (current_entity,()); - Stack.iter check entity_stack; - !is_external - - - method current_entity = current_entity - - method yy_get_next_ref = yy_get_next_ref - - end -;; - - - -(* ====================================================================== - * History: - * - * $Log$ - * Revision 1.1 2000/11/17 09:57:29 lpadovan - * Initial revision - * - * Revision 1.6 2000/07/14 13:55:00 gerd - * Cosmetic changes. - * - * Revision 1.5 2000/07/09 17:51:50 gerd - * Entities return now the beginning of a token as its - * position. - * New method 'position' for entity_manager. - * - * Revision 1.4 2000/07/09 01:05:04 gerd - * Exported methods 'ext_id' and 'notation' anyway. - * - * Revision 1.3 2000/07/08 16:28:05 gerd - * Updated: Exception 'Not_resolvable' is taken into account. - * - * Revision 1.2 2000/07/04 22:12:47 gerd - * Update: Case ext_id = Anonymous. - * Update: Handling of the exception Not_competent when reading - * from a resolver. - * - * Revision 1.1 2000/05/29 23:48:38 gerd - * Changed module names: - * Markup_aux into Pxp_aux - * Markup_codewriter into Pxp_codewriter - * Markup_document into Pxp_document - * Markup_dtd into Pxp_dtd - * Markup_entity into Pxp_entity - * Markup_lexer_types into Pxp_lexer_types - * Markup_reader into Pxp_reader - * Markup_types into Pxp_types - * Markup_yacc into Pxp_yacc - * See directory "compatibility" for (almost) compatible wrappers emulating - * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. - * - * ====================================================================== - * Old logs from markup_entity.ml: - * - * Revision 1.27 2000/05/29 21:14:57 gerd - * Changed the type 'encoding' into a polymorphic variant. - * - * Revision 1.26 2000/05/28 17:24:55 gerd - * Bugfixes. 
- *
- * Revision 1.25  2000/05/27 19:23:32  gerd
- * The entities store whether they count as external with
- * respect to the standalone check: New methods counts_as_external
- * and set_counts_as_external.
- * The entity manager can find out whether the current
- * entity counts as external: method current_entity_counts_as_external.
- *
- * Revision 1.24  2000/05/20 20:31:40  gerd
- * Big change: Added support for various encodings of the
- * internal representation.
- *
- * Revision 1.23  2000/05/14 21:51:24  gerd
- * Change: Whitespace is handled by the grammar, and no longer
- * by the entity.
- *
- * Revision 1.22  2000/05/14 17:50:54  gerd
- * Updates because of changes in the token type.
- *
- * Revision 1.21  2000/05/09 00:02:44  gerd
- * Conditional sections are now recognized by the parser.
- * There seem to be some open questions; see the TODO comments!
- *
- * Revision 1.20  2000/05/08 21:58:22  gerd
- * Introduced entity_manager as communication object between
- * the parser and the currently active entity.
- * New hooks handle_bof and handle_eof.
- * Removed "delegated entities". The entity manager contains
- * the stack of open entities.
- * Changed the way Begin_entity and End_entity are inserted.
- * This is now done by handle_bof and handle_eof.
- * The XML declaration is no longer detected by the entity.
- * This is now done by the parser.
- *
- * Revision 1.19  2000/05/01 15:18:44  gerd
- * Improved CRLF handling in the replacement text of entities.
- * Changed one error message.
- *
- * Revision 1.18  2000/04/30 18:18:39  gerd
- * Bugfixes: The conversion of CR and CRLF to LF is now hopefully
- * done right. The new variable "normalize_newline" indicates whether
- * normalization must happen for that type of entity. The normalization
- * is actually carried out separately for every token that needs it.
- *
- * Revision 1.17  2000/03/13 23:42:38  gerd
- * Removed the resolver classes, and put them into their
- * own module (Markup_reader).
- *
- * Revision 1.16  2000/02/22 01:06:58  gerd
- * Bugfix: Resolvers are properly re-initialized. This bug meant
- * that entities could not be referenced twice in the same document.
- *
- * Revision 1.15  2000/01/20 20:54:11  gerd
- * New config.errors_with_line_numbers.
- *
- * Revision 1.14  2000/01/08 18:59:03  gerd
- * Corrected the string resolver.
- *
- * Revision 1.13  1999/09/01 22:58:23  gerd
- * Method warn_not_latin1 raises Illegal_character if the character
- * does not match the Char production.
- * External entities that are not document entities check if the
- * declaration at the beginning matches the TextDecl production.
- * Method xml_declaration has type ... list option, not ... list.
- * Tag_beg and Tag_end now carry an entity_id with them.
- * The code to check empty entities has changed. That the Begin_entity/
- * End_entity pair is not to be added must be explicitly turned on. See the
- * description of empty entity handling in design.txt.
- * In internal subsets entity declarations are not allowed to refer
- * to parameter entities. The internal_entity class can do this now.
- * The p_parsed parameter of internal_entity has gone. It was simply
- * superfluous.
- *
- * Revision 1.12  1999/09/01 16:24:13  gerd
- * The method replacement_text returns the text as described for
- * "included in literal". The former behaviour has been dropped to include
- * a leading and a trailing space character for parameter entities.
- * Bugfix: When general entities are included, they are always parsed.
- *
- * Revision 1.11  1999/08/31 19:13:31  gerd
- * Added checks on proper PE nesting. The idea is that tokens such
- * as Decl_element and Decl_rangle carry an entity ID with them. This ID
- * is simply an object of type < >, i.e. you can only test on identity.
- * The lexer always produces tokens with a dummy ID because it does not
- * know which entity is the current one. The entity layer replaces the dummy
- * ID with the actual ID.
The parser checks that the IDs of pairs such as - * Decl_element and Decl_rangle are the same; otherwise a Validation_error - * is produced. - * - * Revision 1.10 1999/08/19 01:06:41 gerd - * Improved error messages: external entities print their - * ext id, too - * - * Revision 1.9 1999/08/15 20:35:48 gerd - * Improved error messages. - * Before the tokens Plus, Star, Qmark space is not allowed any longer. - * Detection of recursive entity references is a bit cleaner. - * - * Revision 1.8 1999/08/15 15:33:44 gerd - * Revised whitespace checking: At certain positions there must be - * white space. These checks cannot be part of the lexer, as %entity; counts - * as white space. They cannot be part of the yacc parser because one look-ahead - * token would not suffice if we did that. So these checks must be done by the - * entity layer. Luckily, the rules are simple: There are simply a number of - * token pairs between which white space must occur independently of where - * these token have been found. Two variables, "space_seen", and "last_token" - * have been added in order to check these rules. - * - * Revision 1.7 1999/08/15 00:41:06 gerd - * The [ token of conditional sections is now allowed to occur - * in a different entity. - * - * Revision 1.6 1999/08/15 00:29:02 gerd - * The method "attlist_replacement_text" has gone. There is now a - * more general "replacement_text" method that computes the replacement - * text for both internal and external entities. Additionally, this method - * returns whether references to external entities have been resolved; - * this is checked in the cases where formerly "attlist_replacement_text" - * was used as it is not allowed everywhere. - * Entities have a new slot "need_spaces" that indicates that the - * next token must be white space or a parameter reference. The problem - * was that "