X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_aux.ml;fp=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_aux.ml;h=aa1212b6f3ef96c6f1e024f54ea9805e947adc04;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git

diff --git a/helm/DEVEL/pxp/pxp/pxp_aux.ml b/helm/DEVEL/pxp/pxp/pxp_aux.ml
new file mode 100644
index 000000000..aa1212b6f
--- /dev/null
+++ b/helm/DEVEL/pxp/pxp/pxp_aux.ml
@@ -0,0 +1,651 @@
(* $Id$
 * ----------------------------------------------------------------------
 * PXP: The polymorphic XML parser for Objective Caml.
 * Copyright by Gerd Stolpmann. See LICENSE for details.
 * Some auxiliary functions
 *)

(**********************************************************************)
(* Lexing *)


open Pxp_types
open Pxp_lexer_types
open Pxp_lexers
open Netconversion

let character enc warner k =
  assert (k >= 0);
  if (k >= 0xd800 & k < 0xe000) or (k >= 0xfffe & k <= 0xffff) or k > 0x10ffff
     or (k < 8) or (k = 11) or (k = 12) or (k >= 14 & k <= 31)
  then
    raise (WF_error("Code point " ^ string_of_int k ^
                    " outside the accepted range of code points"));

  try
    makechar (enc : rep_encoding :> encoding) k
  with
    Not_found ->
      warner # warn ("Code point cannot be represented in internal encoding: "
                     ^ string_of_int k);
      ""
;;


let check_name warner name =
  (* produces a warning for names beginning with "xml". *)
  if String.length name >= 3 then begin
    match String.sub name 0 3 with
      ("xml" | "xmL" | "xMl" | "xML" | "Xml" | "XmL" | "XMl" | "XML") ->
        warner # warn ("Name is reserved for future extensions: " ^ name)
    | _ ->
        ()
  end
;;


let tokens_of_content_string lexerset s =
  (* tokenizes general entities and character entities *)
  let lexbuf = Lexing.from_string s in
  let rec next_token () =
    match lexerset.scan_content_string lexbuf with
      Eof -> []
    | tok -> tok :: next_token()
  in
  next_token()
;;


let rec expand_attvalue_with_rec_check lexerset dtd s warner entities norm_crlf =
  (* recursively expands general entities and character entities;
   * checks the "standalone" document declaration;
   * normalizes whitespace
   *)
  let toklist = tokens_of_content_string lexerset s in
  let rec expand tl =
    match tl with
      [] -> ""
    | ERef n :: tl' ->
        if List.mem n entities then
          raise(WF_error("Recursive reference to general entity `" ^ n ^ "'"));
        let en, extdecl = dtd # gen_entity n in
        if dtd # standalone_declaration && extdecl then
          raise(Validation_error("Reference to entity `" ^ n ^
                                 "' violates standalone declaration"));
        let rtext, rtext_contains_ext_refs = en # replacement_text in
        if rtext_contains_ext_refs then
          raise(Validation_error("Found reference to external entity in attribute value"));
        expand_attvalue_with_rec_check
          lexerset dtd rtext warner (n :: entities) false ^ expand tl'
    | CRef(-1) :: tl' ->
        if norm_crlf then
          " " ^ expand tl'     (* CRLF counts as one character *)
        else
          "  " ^ expand tl'    (* CRLF counts as two characters *)
    | CRef n :: tl' ->
        character lexerset.lex_encoding warner n ^ expand tl'
    | CharData "<" :: tl' ->
        raise
          (WF_error
             ("Attribute value contains character '<' literally"))
    | CharData x :: tl' ->
        x ^ expand tl'
    | _ -> assert false
  in
  expand toklist
;;
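
(* [Editor's note] The recursion check above works by threading the list of
 * entity names that are currently being expanded ('entities') through every
 * recursive call. A minimal standalone sketch of the same idea, independent
 * of PXP's lexers and DTD objects; the name expand_sketch and the
 * association list defs are illustrative, not part of PXP:
 *)

let rec expand_sketch defs visited s =
  (* Replace each "&name;" in s by its definition from defs, failing on
   * cyclic definitions. An undefined entity raises Not_found; a missing
   * ';' raises Not_found, too (malformed references are out of scope). *)
  let buf = Buffer.create (String.length s + 16) in
  let n = String.length s in
  let i = ref 0 in
  while !i < n do
    if s.[!i] = '&' then begin
      let j = String.index_from s !i ';' in
      let name = String.sub s (!i + 1) (j - !i - 1) in
      if List.mem name visited then
        failwith ("Recursive reference to general entity `" ^ name ^ "'");
      Buffer.add_string buf
        (expand_sketch defs (name :: visited) (List.assoc name defs));
      i := j + 1
    end else begin
      Buffer.add_char buf s.[!i];
      incr i
    end
  done;
  Buffer.contents buf
;;

(* For instance:
 *   expand_sketch ["a", "x&b;y"; "b", "z"] [] "1&a;2"  = "1xzy2"
 *   expand_sketch ["a", "&a;"] [] "&a;"                raises Failure
 *)
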

let expand_attvalue lexerset dtd s warner norm_crlf =
  (* norm_crlf: true if the sequence CRLF is to be recognized as one
   * character (one space), false if it counts as two characters
   * (two spaces)
   *)
  expand_attvalue_with_rec_check lexerset dtd s warner [] norm_crlf
;;


let count_lines s =
  (* returns the number of lines in s, and the number of columns of the
   * last line *)
  let l = String.length s in

  let rec count n k no_cr no_lf =
    let next_cr =
      if no_cr then
        (-1)
      else
        try String.index_from s k '\013' with Not_found -> (-1) in
    let next_lf =
      if no_lf then
        (-1)
      else
        try String.index_from s k '\010' with Not_found -> (-1) in
    if next_cr >= 0 & (next_lf < 0 or next_cr < next_lf) then begin
      if next_cr+1 < l & s.[next_cr+1] = '\010' then
        count (n+1) (next_cr+2) false (next_lf < 0)
      else
        count (n+1) (next_cr+1) false (next_lf < 0)
    end
    else if next_lf >= 0 then begin
      count (n+1) (next_lf+1) (next_cr < 0) false
    end
    else
      n, (l - k)

  in
  count 0 0 false false
;;
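
(* [Editor's note] count_lines treats CR, LF, and CRLF each as a single
 * line separator; that is why the CRLF case above advances the position by
 * two characters but counts only one line. An illustrative check, not part
 * of the original commit:
 *)
let _test_count_lines () =
  assert (count_lines "abc" = (0, 3));         (* no line separator *)
  assert (count_lines "ab\ncd" = (1, 2));      (* a single LF *)
  assert (count_lines "ab\r\ncd\re" = (2, 1))  (* a CRLF, then a lone CR *)
;;
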

let tokens_of_xml_pi lexers s =
  let lexbuf = Lexing.from_string (s ^ " ") in
  let rec collect () =
    let t = lexers.scan_xml_pi lexbuf in
    match t with
      Pro_eof -> []
    | _ -> t :: collect()
  in
  collect()
;;


let decode_xml_pi pl =
  (* 'pl' must consist of name="value" or name='value' pairs, which are
   * returned as a list of pairs.
   * The "value" is returned as it is; no substitution of &entities;
   * happens.
   *)
  let rec decode pl =
    match pl with
      Pro_name name :: Pro_eq :: Pro_string value :: pl' ->
        (name, value) :: decode pl'
    | [] ->
        []
    | _ ->
        raise (WF_error("Bad XML processing instruction"))
  in
  decode pl
;;


let decode_doc_xml_pi pl =
  match pl with
    [ "version", v ]                                 -> (v, None, None)
  | [ "version", v; "encoding", e ]                  -> (v, Some e, None)
  | [ "version", v; "standalone", s ]                -> (v, None, Some s)
  | [ "version", v; "encoding", e; "standalone", s ] -> (v, Some e, Some s)
  | _ ->
      raise(WF_error("Bad XML declaration"))
;;


let check_text_xml_pi pl =
  match pl with
  | [ "version", v; "encoding", e ] -> ()
  | [ "encoding", e ] -> ()
  | _ ->
      raise(WF_error("Bad XML declaration"))
;;


let check_version_num s =
  let l = String.length s in
  for i = 0 to l - 1 do
    match s.[i] with
      ('a'..'z'|'A'..'Z'|'0'..'9'|
       '-'|'_'|'.'|':') -> ()
    | _ ->
        raise(WF_error("Bad XML version string"))
  done
;;


let check_public_id s =
  let l = String.length s in
  for i = 0 to l - 1 do
    match s.[i] with
      (' '|'\013'|'\010'|'a'..'z'|'A'..'Z'|'0'..'9'|
       '-'|'\''|'('|')'|'+'|','|'.'|'/'|':'|'='|'?'|
       ';'|'!'|'*'|'#'|'@'|'$'|'_'|'%') -> ()
    | _ ->
        raise(WF_error("Illegal character in PUBLIC identifier"))
  done
;;
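
(* [Editor's note] decode_doc_xml_pi accepts exactly the four attribute
 * layouts that may occur in a document's XML declaration, in that fixed
 * order. An illustrative check, not part of the original commit:
 *)
let _test_decode_doc_xml_pi () =
  assert (decode_doc_xml_pi [ "version", "1.0" ]
          = ("1.0", None, None));
  assert (decode_doc_xml_pi [ "version", "1.0"; "encoding", "UTF-8" ]
          = ("1.0", Some "UTF-8", None));
  assert (decode_doc_xml_pi [ "version", "1.0"; "standalone", "yes" ]
          = ("1.0", None, Some "yes"))
;;
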

(**********************************************************************)
(* list functions *)


let rec check_dups l =
  match l with
    [] -> false
  | c :: l' ->
      if List.mem c l' then true else check_dups l'
;;


let rec count pred l =
  match l with
    [] -> 0
  | x :: l' ->
      if pred x then 1 + (count pred l') else count pred l'
;;


(**********************************************************************)
(* attributes *)

let check_attribute_value_lexically lexerset x t v =
  (* raises x if the attribute value v does not match the lexical rules
   * for attribute type t:
   * - t = A_id:         v must be a <name>
   * - t = A_idref:      v must match <name>
   * - t = A_idrefs:     v must match <names>
   * - t = A_entity:     v must match <name>
   * - t = A_entities:   v must match <names>
   * - t = A_nmtoken:    v must match <nmtoken>
   * - t = A_nmtokens:   v must match <nmtokens>
   * - t = A_notation _: v must match <name>
   * - t = A_enum _:     v must match <nmtoken>
   * - t = A_cdata:      not checked
   *)
  let lexbuf = Lexing.from_string v in
  let rec get_name_list() =
    match lexerset.scan_name_string lexbuf with
      Eof -> []
    | Ignore -> get_name_list()
    | tok -> tok :: get_name_list()
  in
  let l = get_name_list() in
  match t with
    (A_id | A_idref | A_entity | A_notation _) ->
      begin match l with
        [ Name n ] -> ()
      | _ -> raise (Lazy.force x)
      end
  | (A_idrefs | A_entities) ->
      if List.exists (fun tok ->
                        match tok with
                          Name _ -> false
                        | _ -> true) l then
        raise (Lazy.force x)
  | (A_nmtoken | A_enum _) ->
      begin match l with
        [ Name n ] -> ()
      | [ Nametoken n ] -> ()
      | _ -> raise (Lazy.force x)
      end
  | A_nmtokens ->
      if List.exists (fun tok ->
                        match tok with
                          Name _ -> false
                        | Nametoken _ -> false
                        | _ -> true
                     ) l then
        raise (Lazy.force x)
  | _ -> ()
;;


let split_attribute_value lexerset v =
  (* splits 'v' into a list of names or nmtokens. The white space separating
   * the names/nmtokens in 'v' is suppressed and not returned.
   *)
  let lexbuf = Lexing.from_string v in
  let rec get_name_list() =
    match lexerset.scan_name_string lexbuf with
      Eof -> []
    | Ignore -> get_name_list()
    | Name s -> s :: get_name_list()
    | Nametoken s -> s :: get_name_list()
    | _ -> raise(Validation_error("Illegal attribute value"))
  in
  get_name_list()
;;


let normalize_line_separators lexerset s =
  let lexbuf = Lexing.from_string s in
  let rec get_string() =
    match lexerset.scan_for_crlf lexbuf with
      Eof -> ""
    | CharData s -> s ^ get_string()
    | _ -> assert false
  in
  get_string()
;;


let value_of_attribute lexerset dtd n atype v =
  (* The attribute with name 'n', type 'atype' and string value 'v' is
   * decomposed, and the att_value is returned:
   * - It is checked whether 'v' conforms to the lexical rules for attributes
   *   of type 'atype'
   * - If 'atype <> A_cdata', leading and trailing spaces are removed from 'v'
   * - If 'atype = A_notation d', it is checked whether 'v' matches one of the
   *   notation names contained in d
   * - If 'atype = A_enum d', it is checked whether 'v' matches one of the
   *   tokens from d
   * - If 'atype' refers to a "single-value" type, the value is returned as
   *   Value u, where u is the normalized value. If 'atype' refers to a
   *   "list" type, the value is returned as Valuelist l, where l contains
   *   the tokens.
   *
   * Note that this function does not implement all normalization rules.
   * It is expected that the string passed as 'v' is already preprocessed;
   * i.e. character and entity references are resolved, and the substitution
   * of white space characters by space characters has already been performed.
   * If these requirements are met, the value returned by this function
   * will be perfectly normalized.
   *
   * Further checks:
   * - ENTITY and ENTITIES values: It is checked whether there is an
   *   unparsed general entity
   * [ Other checks planned: ID, IDREF, IDREFS but not yet implemented ]
   *)

  let lexical_error() =
    lazy (raise(Validation_error("Attribute `" ^ n ^ "' is lexically malformed"))) in

  let remove_leading_and_trailing_spaces u =
    (* Precondition: 'u' matches <name> or <nmtoken> *)
    match split_attribute_value lexerset u with
      [ u' ] -> u'
    | _ -> assert false
  in

  let check_ndata_entity u =
    let en, extdecl = dtd # gen_entity u in   (* or Validation_error *)
    if not (en # is_ndata) then
      raise(Validation_error("Reference to entity `" ^ u ^
                             "': NDATA entity expected"));
    if dtd # standalone_declaration && extdecl then
      raise(Validation_error("Reference to entity `" ^ u ^
                             "' violates standalone declaration"));
  in

  match atype with
    A_cdata ->
      Value v

  | (A_id | A_idref | A_nmtoken) ->
      check_attribute_value_lexically lexerset (lexical_error()) atype v;
      Value (remove_leading_and_trailing_spaces v)

  | A_entity ->
      check_attribute_value_lexically lexerset (lexical_error()) atype v;
      let v' = remove_leading_and_trailing_spaces v in
      check_ndata_entity v';
      Value v'

  | (A_idrefs | A_nmtokens) ->
      check_attribute_value_lexically lexerset (lexical_error()) atype v;
      Valuelist (split_attribute_value lexerset v)

  | A_entities ->
      check_attribute_value_lexically lexerset (lexical_error()) atype v;
      let l = split_attribute_value lexerset v in
      List.iter check_ndata_entity l;
      Valuelist l

  | A_notation nl ->
      check_attribute_value_lexically lexerset (lexical_error()) atype v;
      let v' = remove_leading_and_trailing_spaces v in
      if not (List.mem v' nl) then
        raise(Validation_error
                ("Attribute `" ^ n ^
                 "' does not match one of the declared notation names"));
      Value v'

  | A_enum enuml ->
      check_attribute_value_lexically lexerset (lexical_error()) atype v;
      let v' = remove_leading_and_trailing_spaces v in
      if not (List.mem v' enuml) then
        raise(Validation_error
                ("Attribute `" ^ n ^
                 "' does not match one of the declared enumerator tokens"));
      Value v'
;;
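
(* [Editor's note] The observable effect of value_of_attribute by type
 * class, shown as expected results rather than runnable code because
 * 'lexerset' and 'dtd' only exist inside a running parser (values are
 * illustrative):
 *
 *   value_of_attribute lexerset dtd "x" A_cdata    " a  b "
 *     = Value " a  b "          (CDATA is kept verbatim)
 *   value_of_attribute lexerset dtd "x" A_nmtoken  " ab "
 *     = Value "ab"              (single-value type: spaces stripped)
 *   value_of_attribute lexerset dtd "x" A_nmtokens " a  b "
 *     = Valuelist ["a"; "b"]    (list type: split at white space)
 *)
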

let normalization_changes_value lexerset atype v =
  (* Returns true if:
   * - 'atype' is a "single-value" type, and the normalization of the string
   *   value 'v' of this type discards leading and/or trailing spaces
   * - 'atype' is a "list" type, and the normalization of the string value
   *   'v' of this type discards leading and/or trailing spaces, or spaces
   *   separating the tokens of the list (i.e. the normal form is that
   *   the tokens are separated by exactly one space character).
   *
   * Note: It is assumed that TABs, CRs, and LFs in 'v' are already converted
   * to spaces.
   *)

  match atype with
    A_cdata ->
      false

  | (A_id | A_idref | A_entity | A_nmtoken | A_notation _ | A_enum _) ->
      (* Return 'true' if the first or last character is a space.
       * The following check works for both ISO-8859-1 and UTF-8.
       *)
      v <> "" && (v.[0] = ' ' || v.[String.length v - 1] = ' ')

  | (A_idrefs | A_entities | A_nmtokens) ->
      (* Split the list, and concatenate the tokens as required by
       * the normal form. Return 'true' if this operation results in
       * a different string than 'v'.
       * This check works for both ISO-8859-1 and UTF-8.
       *)
      let l = split_attribute_value lexerset v in
      let v' = String.concat " " l in
      v <> v'
;;
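
(* [Editor's note] For the list types, the test above is equivalent to the
 * following standalone sketch on plain strings (illustrative only; it uses
 * the modern String.split_on_char instead of PXP's lexers, and assumes
 * TAB/CR/LF were already turned into spaces):
 *)
let _normalization_changes_list_value_sketch v =
  let tokens = List.filter (fun t -> t <> "") (String.split_on_char ' ' v) in
  String.concat " " tokens <> v
;;
(* _normalization_changes_list_value_sketch "a b"  = false   already normal
 * _normalization_changes_list_value_sketch " a b" = true    leading space
 * _normalization_changes_list_value_sketch "a  b" = true    double space
 *)
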

(**********************************************************************)

let write_markup_string ~(from_enc:rep_encoding) ~to_enc os s =
  (* Write the 'from_enc'-encoded string 's' as 'to_enc'-encoded string to
   * 'os'. All characters are written as they are.
   *)
  let s' =
    if to_enc = (from_enc :> encoding)
    then s
    else recode_string
           ~in_enc:(from_enc :> encoding)
           ~out_enc:to_enc
           ~subst:(fun n ->
                     failwith
                       ("Pxp_aux.write_markup_string: Cannot represent " ^
                        "code point " ^ string_of_int n))
           s
  in
  write os s' 0 (String.length s')
;;


let write_data_string ~(from_enc:rep_encoding) ~to_enc os content =
  (* Write the 'from_enc'-encoded string 'content' as 'to_enc'-encoded string
   * to 'os'. The characters '&', '<', '>', '"', '%' and every character that
   * cannot be represented in 'to_enc' are paraphrased as entity references
   * "&...;".
   *)
  let convert_ascii s =
    (* Convert the ASCII-encoded string 's'. Note that 'from_enc' is
     * always ASCII-compatible
     *)
    if to_enc = (from_enc :> encoding)
    then s
    else
      recode_string
        ~in_enc:(from_enc :> encoding)
        ~out_enc:to_enc
        ~subst:(fun n -> assert false)
        s
  in

  let write_ascii s =
    (* Write the ASCII-encoded string 's' *)
    let s' = convert_ascii s in
    write os s' 0 (String.length s')
  in

  let write_part j l =
    (* Writes the substring of 'content' beginning at pos 'j' with length 'l'
     *)
    if to_enc = (from_enc :> encoding) then
      write os content j l
    else begin
      let s' = recode_string
                 ~in_enc:(from_enc :> encoding)
                 ~out_enc:to_enc
                 ~subst:(fun n ->
                           convert_ascii ("&#" ^ string_of_int n ^ ";"))
                 (String.sub content j l)
      in
      write os s' 0 (String.length s')
    end
  in

  let i = ref 0 in
  for k = 0 to String.length content - 1 do
    match content.[k] with
      ('&' | '<' | '>' | '"' | '%') as c ->
        if !i < k then
          write_part !i (k - !i);
        begin match c with
          '&' -> write_ascii "&amp;"
        | '<' -> write_ascii "&lt;"
        | '>' -> write_ascii "&gt;"
        | '"' -> write_ascii "&quot;"
        | '%' -> write_ascii "&#37;"   (* '%' is reserved in DTDs *)
        | _   -> assert false
        end;
        i := k+1
    | _ -> ()
  done;
  if !i < String.length content then
    write_part !i (String.length content - !i)
;;
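
(* [Editor's note] For example (illustrative; 'os' is any output stream
 * accepted by 'write' above), with both encodings UTF-8 the call
 *
 *   write_data_string ~from_enc:`Enc_utf8 ~to_enc:`Enc_utf8 os "a < b & c"
 *
 * emits   a &lt; b &amp; c   - only the five special characters are
 * rewritten; the stretches between them are copied in one piece by
 * write_part rather than character by character.
 *)
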

(* ======================================================================
 * History:
 *
 * $Log$
 * Revision 1.1  2000/11/17 09:57:29  lpadovan
 *         Initial revision
 *
 * Revision 1.6  2000/08/14 22:24:55  gerd
 *         Moved the module Pxp_encoding to the netstring package under
 *         the new name Netconversion.
 *
 * Revision 1.5  2000/07/25 00:30:01  gerd
 *         Added support for pxp:dtd PI options.
 *
 * Revision 1.4  2000/07/16 18:31:09  gerd
 *         The exception Illegal_character has been dropped.
 *
 * Revision 1.3  2000/07/16 16:33:57  gerd
 *         New function write_markup_string: Handles the encoding
 *         of the string.
 *
 * Revision 1.2  2000/07/08 22:15:45  gerd
 *         [Merging 0.2.10:] write_data_string: The character '%' is
 *         special, too.
 *
 * Revision 1.1  2000/05/29 23:48:38  gerd
 *         Changed module names:
 *                 Markup_aux          into Pxp_aux
 *                 Markup_codewriter   into Pxp_codewriter
 *                 Markup_document     into Pxp_document
 *                 Markup_dtd          into Pxp_dtd
 *                 Markup_entity       into Pxp_entity
 *                 Markup_lexer_types  into Pxp_lexer_types
 *                 Markup_reader       into Pxp_reader
 *                 Markup_types        into Pxp_types
 *                 Markup_yacc         into Pxp_yacc
 *         See directory "compatibility" for (almost) compatible wrappers
 *         emulating Markup_document, Markup_dtd, Markup_reader,
 *         Markup_types, and Markup_yacc.
 *
 * ======================================================================
 * Old logs from markup_aux.ml:
 *
 * Revision 1.12  2000/05/27 19:08:30  gerd
 *         Added functionality to check the standalone declaration:
 *
 *         expand_attvalue: Checks whether included entities violate the
 *         stand-alone declaration.
 *
 *         value_of_attribute: Checks whether ENTITY/ENTITIES values violate
 *         this declaration. (Furthermore, it is checked whether the NDATA
 *         entity exists - this has been forgotten in previous versions.)
 *
 *         value_of_attribute/check_attribute_value_lexically: improved.
 *
 *         New function normalization_changes_value: helps detecting
 *         one case which violates the standalone declaration.
 *
 * Revision 1.11  2000/05/20 20:31:40  gerd
 *         Big change: Added support for various encodings of the
 *         internal representation.
 *
 * Revision 1.10  2000/05/01 20:41:56  gerd
 *         New function write_data_string.
 *
 * Revision 1.9  2000/04/30 18:11:31  gerd
 *         New function normalize_line_separators.
 *         In function expand_attvalue: New argument norm_crlf. If the
 *         attvalue is read directly from a file, the sequence CR LF must
 *         be converted to a single space. If the attvalue is read from a
 *         replacement text, CR LF has already been converted to a single
 *         LF, and CR LF, if still occurring, must be converted to two
 *         spaces. The caller can indicate the case by passing true/false
 *         as norm_crlf.
 *
 * Revision 1.8  1999/09/01 22:51:07  gerd
 *         Added functions. 'character' raises Illegal_character if
 *         characters are found that do not match the production Char.
 *
 * Revision 1.7  1999/09/01 16:17:37  gerd
 *         Added function 'check_name'.
 *
 * Revision 1.6  1999/08/15 20:33:19  gerd
 *         Added: a function that checks public identifiers. Only certain
 *         characters may occur in these identifiers.
 *         Control characters are rejected by the "character" function.
 *         Bugfix: recursive entity references are detected in attribute
 *         expansion.
 *
 * Revision 1.5  1999/08/15 02:18:02  gerd
 *         That '<' is not allowed in attribute values is a violation of
 *         well-formedness, not of validity; so WF_error is raised.
 *
 * Revision 1.4  1999/08/15 00:20:37  gerd
 *         When expanding attribute values, references to parameter
 *         entities are now resolved by the method "replacement_text", which
 *         has an additional return value, and no longer by
 *         "attlist_replacement_text". The new return value indicates
 *         whether references to external entities have been resolved
 *         (directly or indirectly); this is allowed at some locations but
 *         not in attribute values.
 *
 * Revision 1.3  1999/08/14 22:05:53  gerd
 *         Several functions now have a "warner" as argument, which is
 *         an object with a "warn" method. This is used to warn about
 *         characters that cannot be represented in the Latin 1 alphabet.
 *
 * Revision 1.2  1999/08/10 21:35:06  gerd
 *         The XML/encoding declaration at the beginning of entities is
 *         evaluated. In particular, entities now have a method
 *         "xml_declaration" which returns the name/value pairs of such a
 *         declaration. The "encoding" setting is interpreted by the entity
 *         itself; "version" and "standalone" are interpreted by
 *         Markup_yacc.parse_document_entity. Other settings are ignored
 *         (this does not conform to the standard; the standard prescribes
 *         that "version" MUST be given in the declaration of a document;
 *         "standalone" and "encoding" CAN be declared; no other settings
 *         are allowed).
 *         TODO: The user should be warned if the standard is not exactly
 *         fulfilled. -- The "standalone" property is not checked yet.
 *
 * Revision 1.1  1999/08/10 00:35:50  gerd
 *         Initial revision.
 *
 *
 *)