Initial revision

[helm.git] / helm / DEVEL / pxp / pxp / pxp_aux.ml
diff --git a/helm/DEVEL/pxp/pxp/pxp_aux.ml b/helm/DEVEL/pxp/pxp/pxp_aux.ml

new file mode 100644 (file)

index 0000000..aa1212b
--- /dev/null
+++ b/helm/DEVEL/pxp/pxp/pxp_aux.ml
@@ -0,0 +1,651 @@
+(* $Id$
+ * ----------------------------------------------------------------------
+ * PXP: The polymorphic XML parser for Objective Caml.
+ * Copyright by Gerd Stolpmann. See LICENSE for details.
+ * Some auxiliary functions 
+ *)
+
+(**********************************************************************)
+(* Lexing *)
+
+
+open Pxp_types
+open Pxp_lexer_types
+open Pxp_lexers
+open Netconversion
+
+let character enc warner k =
+  assert (k>=0);
+  if (k >= 0xd800 & k < 0xe000) or (k >= 0xfffe & k <= 0xffff) or k > 0x10ffff
+     or (k < 8) or (k = 11) or (k = 12) or (k >= 14 & k <= 31)
+  then
+    raise (WF_error("Code point " ^ string_of_int k ^ 
+                   " outside the accepted range of code points"));
+
+  try
+    makechar (enc : rep_encoding :> encoding) k
+  with
+      Not_found ->
+       warner # warn ("Code point cannot be represented in internal encoding: "
+                      ^ string_of_int k);
+       ""
+;;
+
+
+let check_name warner name =
+  (* produces a warning for names beginning with "xml". *)
+  if String.length name >= 3 then begin
+    match String.sub name 0 3 with
+       ("xml" | "xmL" | "xMl" | "xML" | "Xml" | "XmL" | "XMl" | "XML") ->
+         warner # warn ("Name is reserved for future extensions: " ^ name)
+      | _ ->
+         ()
+  end
+;;
+
+
+let tokens_of_content_string lexerset s =
+  (* tokenizes general entities and character entities *)
+  let lexbuf = Lexing.from_string s in
+  let rec next_token () =
+    match lexerset.scan_content_string lexbuf with
+       Eof -> []
+      | tok -> tok :: next_token()
+  in
+  next_token()
+;;
+
+
+let rec expand_attvalue_with_rec_check lexerset dtd s warner entities norm_crlf =
+  (* recursively expands general entities and character entities;
+   * checks "standalone" document declaration;
+   * normalizes whitespace
+   *)
+  let toklist = tokens_of_content_string lexerset s in
+  let rec expand tl =
+    match tl with
+       [] -> ""
+      | ERef n :: tl' ->
+         if List.mem n entities then
+           raise(WF_error("Recursive reference to general entity `" ^ n ^ "'"));
+         let en, extdecl = dtd # gen_entity n in
+         if dtd # standalone_declaration && extdecl then
+           raise(Validation_error("Reference to entity `" ^ n ^ 
+                                  "' violates standalone declaration"));
+         let rtext, rtext_contains_ext_refs = en # replacement_text in
+         if rtext_contains_ext_refs then
+           raise(Validation_error("Found reference to external entity in attribute value"));
+         expand_attvalue_with_rec_check 
+           lexerset dtd rtext warner (n :: entities) false    ^    expand tl'
+      | CRef(-1) :: tl' ->
+         if norm_crlf then
+           " " ^ expand tl'
+         else
+           "  " ^ expand tl'
+      | CRef n :: tl' ->
+         character lexerset.lex_encoding warner n ^ expand tl'
+      | CharData "<" :: tl' ->
+         raise 
+           (WF_error
+              ("Attribute value contains character '<' literally"))
+      | CharData x :: tl' ->
+         x ^ expand tl'
+      | _ -> assert false
+  in
+  expand toklist
+;;
+
+
+let expand_attvalue lexerset dtd s warner norm_crlf =
+  (* norm_crlf: whether the sequence CRLF is recognized as one character or
+   * not (i.e. two characters)
+   *)
+  expand_attvalue_with_rec_check lexerset dtd s warner [] norm_crlf
+;;
+
+
+let count_lines s =
+  (* returns number of lines in s, number of columns of the last line *)
+  let l = String.length s in
+
+  let rec count n k no_cr no_lf =
+    let next_cr = 
+      if no_cr then
+       (-1)
+      else
+       try String.index_from s k '\013' with Not_found -> (-1) in
+    let next_lf = 
+      if no_lf then
+       (-1)
+      else
+       try String.index_from s k '\010' with Not_found -> (-1) in
+    if next_cr >= 0 & (next_lf < 0 or next_cr < next_lf) then begin
+      if next_cr+1 < l & s.[next_cr+1] = '\010' then
+       count (n+1) (next_cr+2) false (next_lf < 0)
+      else
+       count (n+1) (next_cr+1) false (next_lf < 0)
+    end
+    else if next_lf >= 0 then begin
+      count (n+1) (next_lf+1) (next_cr < 0) false
+    end
+    else
+      n, (l - k)
+
+  in
+  count 0 0 false false
+;;
+
+
+let tokens_of_xml_pi lexers s =
+  let lexbuf = Lexing.from_string (s ^ " ") in
+  let rec collect () =
+    let t = lexers.scan_xml_pi lexbuf in
+    match t with
+       Pro_eof -> []
+      | _       -> t :: collect()
+  in
+  collect()
+;;
+
+
+let decode_xml_pi pl =
+  (* 'pl' must consist of name="value" or name='value' pairs which are returned
+   * as list of pairs.
+   * The "value" is returned as it is; no substitution of &entities; happens.
+   *)
+  let rec decode pl =
+    match pl with
+       Pro_name name :: Pro_eq :: Pro_string value :: pl' ->
+         (name, value) :: decode pl'
+      | [] ->
+         []
+      | _ ->
+         raise (WF_error("Bad XML processing instruction"))
+  in
+  decode pl
+;;
+
+
+let decode_doc_xml_pi pl =
+  match pl with
+      [ "version", v ]                                  -> (v, None, None)
+    | [ "version", v; "encoding", e ]                   -> (v, Some e, None)
+    | [ "version", v; "standalone", s ]                 -> (v, None, Some s)
+    | [ "version", v; "encoding", e; "standalone", s ]  -> (v, Some e, Some s)
+    | _ ->
+       raise(WF_error("Bad XML declaration"))
+;;
+
+
+let check_text_xml_pi pl =
+  match pl with
+    | [ "version", v; "encoding", e ] -> ()
+    | [ "encoding", e ]  -> ()
+    | _ ->
+       raise(WF_error("Bad XML declaration"))
+;;
+
+
+let check_version_num s =
+  let l = String.length s in
+  for i = 0 to l - 1 do
+    match s.[i] with
+       ('a'..'z'|'A'..'Z'|'0'..'9'|
+        '-'|'_'|'.'|':') -> ()
+      | _ ->
+         raise(WF_error("Bad XML version string"))
+  done
+;;
+
+
+let check_public_id s =
+  let l = String.length s in
+  for i = 0 to l - 1 do
+    match s.[i] with
+       (' '|'\013'|'\010'|'a'..'z'|'A'..'Z'|'0'..'9'|
+        '-'|'\''|'('|')'|'+'|','|'.'|'/'|':'|'='|'?'|
+        ';'|'!'|'*'|'#'|'@'|'$'|'_'|'%') -> ()
+      | _ ->
+         raise(WF_error("Illegal character in PUBLIC identifier"))
+  done
+;;
+
+
+(**********************************************************************)
+(* list functions *)
+
+
+let rec check_dups l =
+  match l with
+      [] -> false
+    | c :: l' -> 
+       if List.mem c l' then true else check_dups l'
+;;
+
+
+let rec count pred l =
+  match l with
+      [] -> 0
+    | x :: l' -> 
+       if pred x then  1 + (count pred l') else count pred l'
+;;
+
+
+(**********************************************************************)
+(* attributes *)
+
+let check_attribute_value_lexically lexerset x t v =
+  (* raises x if the attribute value v does not match the lexical rules
+   * for attribute type t:
+   * - t = A_id: v must be a <name>
+   * - t = A_idref: v must match <name>
+   * - t = A_idrefs: v must match <names>
+   * - t = A_entity: v must match <name>
+   * - t = A_entities: v must match <names>
+   * - t = A_nmtoken: v must match <nmtoken>
+   * - t = A_nmtokens: v must match <nmtokens>
+   * - t = A_notation _: v must match <name>
+   * - t = A_enum _: v must match <nmtoken>
+   * - t = A_cdata: not checked
+   *)
+  let lexbuf = Lexing.from_string v in
+  let rec get_name_list() =
+    match lexerset.scan_name_string lexbuf with
+       Eof    -> []
+      | Ignore -> get_name_list()
+      | tok    -> tok :: get_name_list()
+  in
+  let l = get_name_list() in
+  match t with
+      (A_id | A_idref | A_entity | A_notation _) ->
+       begin match l with
+           [ Name n ] -> ()
+         | _          -> raise (Lazy.force x)
+       end
+    | (A_idrefs | A_entities) ->
+       if List.exists (fun tok -> 
+                         match tok with
+                             Name _ -> false
+                           | _ -> true) l then
+         raise (Lazy.force x)
+    | (A_nmtoken | A_enum _) ->
+       begin match l with
+           [ Name n ]      -> ()
+         | [ Nametoken n ] -> ()
+         | _               -> raise (Lazy.force x)
+       end
+    | A_nmtokens ->
+       if List.exists (fun tok -> 
+                         match tok with
+                             Name _ -> false
+                           | Nametoken _ -> false
+                           | _ -> true
+                      ) l then
+         raise (Lazy.force x)
+    | _ -> ()
+;;
+
+
+let split_attribute_value lexerset v =
+  (* splits 'v' into a list of names or nmtokens. The white space separating
+   * the names/nmtokens in 'v' is suppressed and not returned.
+   *)
+  let lexbuf = Lexing.from_string v in
+  let rec get_name_list() =
+    match lexerset.scan_name_string lexbuf with
+       Eof         -> []
+      | Ignore      -> get_name_list()
+      | Name s      -> s :: get_name_list()
+      | Nametoken s -> s :: get_name_list()
+      | _           -> raise(Validation_error("Illegal attribute value"))
+  in
+  get_name_list()
+;;
+
+
+let normalize_line_separators lexerset s =
+  let lexbuf = Lexing.from_string s in
+  let rec get_string() =
+    match lexerset.scan_for_crlf lexbuf with
+       Eof        -> ""
+      | CharData s -> s ^ get_string()
+      | _          -> assert false
+  in
+  get_string()
+;;
+
+
+let value_of_attribute lexerset dtd n atype v =
+  (* The attribute with name 'n', type 'atype' and string value 'v' is
+   * decomposed, and the att_value is returned:
+   * - It is checked whether 'v' conforms to the lexical rules for attributes
+   *   of type 'atype'
+   * - If 'atype <> A_cdata', leading and trailing spaces are removed from 'v'.
+   * - If 'atype = A_notation d', it is checked if 'v' matches one of the
+   *   notation names contained in d.
+   * - If 'atype = A_enum d', it is checked whether 'v' matches one of the
+   *   tokens from d
+   * - If 'atype' refers to a "single-value" type, the value is retured as
+   *   Value u, where u is the normalized value. If 'atype' refers to a 
+   *   "list" type, the value if returned as Valuelist l, where l contains
+   *   the tokens.
+   *
+   * Note that this function does not implement all normalization rules.
+   * It is expected that the string passed as 'v' is already preprocessed;
+   * i.e. character and entity references are resolved, and the substitution
+   * of white space characters by space characters has already been performed.
+   * If these requirements are met, the value returned by this function
+   * will be perfectly normalized.
+   *
+   * Further checks:
+   * - ENTITY and ENTITIES values: It is checked whether there is an
+   *   unparsed general entity
+   * [ Other checks planned: ID, IDREF, IDREFS but not yet implemented ]
+   *)
+
+  let lexical_error() =
+    lazy (raise(Validation_error("Attribute `" ^ n ^ "' is lexically malformed"))) in
+
+  let remove_leading_and_trailing_spaces u =
+    (* Precondition: 'u' matches <name> or <nmtoken> *)
+    match split_attribute_value lexerset u with
+       [ u' ] -> u'
+      | _      -> assert false
+  in
+
+  let check_ndata_entity u =
+    let en, extdecl = dtd # gen_entity u in  (* or Validation_error *)
+    if not (en # is_ndata) then
+      raise(Validation_error("Reference to entity `" ^ u ^ 
+                            "': NDATA entity expected"));
+    if dtd # standalone_declaration && extdecl then
+      raise(Validation_error("Reference to entity `" ^ u ^ 
+                            "' violates standalone declaration"));
+  in
+
+  match atype with
+      A_cdata ->
+       Value v
+
+    | (A_id | A_idref | A_nmtoken) ->
+       check_attribute_value_lexically lexerset (lexical_error()) atype v;
+       Value (remove_leading_and_trailing_spaces v)
+    | A_entity ->
+       check_attribute_value_lexically lexerset (lexical_error()) atype v;
+       let v' = remove_leading_and_trailing_spaces v in
+       check_ndata_entity v';
+       Value v'
+
+    | (A_idrefs | A_nmtokens) ->
+       check_attribute_value_lexically lexerset (lexical_error()) atype v;
+       Valuelist (split_attribute_value lexerset v)
+
+    | A_entities ->
+       check_attribute_value_lexically lexerset (lexical_error()) atype v;
+       let l = split_attribute_value lexerset v in
+       List.iter check_ndata_entity l;
+       Valuelist l
+
+    | A_notation nl ->
+       check_attribute_value_lexically lexerset (lexical_error()) atype v;
+       let v' = remove_leading_and_trailing_spaces v in
+       if not (List.mem v' nl) then
+         raise(Validation_error
+                 ("Attribute `" ^ n ^ 
+                  "' does not match one of the declared notation names"));
+       Value v'
+
+    | A_enum enuml ->
+       check_attribute_value_lexically lexerset (lexical_error()) atype v;
+       let v' = remove_leading_and_trailing_spaces v in
+       if not (List.mem v' enuml) then
+         raise(Validation_error
+                 ("Attribute `" ^ n ^ 
+                  "' does not match one of the declared enumerator tokens"));
+       Value v'
+;;
+
+
+let normalization_changes_value lexerset atype v =
+  (* Returns true if:
+   * - 'atype' is a "single-value" type, and the normalization of the string
+   *   value 'v' of this type discards leading and/or trailing spaces
+   * - 'atype' is a "list" type, and the normalization of the string value
+   *   'v' of this type discards leading and/or trailing spaces, or spaces
+   *   separating the tokens of the list (i.e. the normal form is that
+   *   the tokens are separated by exactly one space character).
+   *
+   * Note: It is assumed that TABs, CRs, and LFs in 'v' are already converted
+   * to spaces.
+   *)
+
+  match atype with
+      A_cdata -> 
+       false
+
+    | (A_id | A_idref | A_entity | A_nmtoken | A_notation _ | A_enum _) ->
+       (* Return 'true' if the first or last character is a space.
+        * The following check works for both ISO-8859-1 and UTF-8.
+        *)
+       v <> "" && (v.[0] = ' ' || v.[String.length v - 1] = ' ')
+
+    | (A_idrefs | A_entities | A_nmtokens) ->
+       (* Split the list, and concatenate the tokens as required by
+        * the normal form. Return 'true' if this operation results in 
+        * a different string than 'v'.
+        * This check works for both ISO-8859-1 and UTF-8.
+        *)
+       let l = split_attribute_value lexerset v in
+       let v' = String.concat " " l in
+       v <> v'
+;;
+
+
+(**********************************************************************)
+
+let write_markup_string ~(from_enc:rep_encoding) ~to_enc os s =
+  (* Write the 'from_enc'-encoded string 's' as 'to_enc'-encoded string to
+   * 'os'. All characters are written as they are.
+   *)
+  let s' =
+    if to_enc = (from_enc :> encoding)
+    then s 
+    else recode_string 
+                ~in_enc:(from_enc :> encoding)
+                ~out_enc:to_enc
+                ~subst:(fun n -> 
+                          failwith 
+                            ("Pxp_aux.write_markup_string: Cannot represent " ^
+                             "code point " ^ string_of_int n))
+                s
+  in
+  write os s' 0 (String.length s')
+;;
+
+
+let write_data_string ~(from_enc:rep_encoding) ~to_enc os content =
+  (* Write the 'from_enc'-encoded string 's' as 'to_enc'-encoded string to
+   * 'os'. The characters '&', '<', '>', '"', '%' and every character that
+   * cannot be represented in 'to_enc' are paraphrased as entity reference
+   * "&...;".
+   *)
+  let convert_ascii s =
+    (* Convert the ASCII-encoded string 's'. Note that 'from_enc' is
+     * always ASCII-compatible
+     *)
+    if to_enc = (from_enc :> encoding) 
+    then s
+    else
+      recode_string
+        ~in_enc:(from_enc :> encoding)
+        ~out_enc:to_enc
+        ~subst:(fun n -> assert false)
+       s
+  in
+
+  let write_ascii s =
+    (* Write the ASCII-encoded string 's' *)
+    let s' = convert_ascii s in
+    write os s' 0 (String.length s')
+  in
+      
+  let write_part j l =
+    (* Writes the substring of 'content' beginning at pos 'j' with length 'l'
+     *)
+    if to_enc = (from_enc :> encoding) then
+      write os content j l
+    else begin
+      let s' = recode_string 
+                ~in_enc:(from_enc :> encoding)
+                ~out_enc:to_enc
+                ~subst:(fun n -> 
+                          convert_ascii ("&#" ^ string_of_int n ^ ";"))
+                (String.sub content j l)
+      in
+      write os s' 0 (String.length s')
+    end
+  in
+
+  let i = ref 0 in
+  for k = 0 to String.length content - 1 do
+    match content.[k] with
+       ('&' | '<' | '>' | '"' | '%') as c ->
+         if !i < k then
+           write_part !i (k - !i);
+         begin match c with
+             '&' -> write_ascii "&amp;"
+           | '<' -> write_ascii "&lt;"
+           | '>' -> write_ascii "&gt;"
+           | '"' -> write_ascii "&quot;"
+           | '%' -> write_ascii "&#37;"  (* reserved in DTDs *)
+           | _   -> assert false
+         end;
+         i := k+1
+      | _ -> ()
+  done;
+  if !i < String.length content then
+    write_part !i (String.length content - !i)
+;;
+
+
+(* ======================================================================
+ * History:
+ * 
+ * $Log$
+ * Revision 1.1  2000/11/17 09:57:29  lpadovan
+ * Initial revision
+ *
+ * Revision 1.6  2000/08/14 22:24:55  gerd
+ *     Moved the module Pxp_encoding to the netstring package under
+ * the new name Netconversion.
+ *
+ * Revision 1.5  2000/07/25 00:30:01  gerd
+ *     Added support for pxp:dtd PI options.
+ *
+ * Revision 1.4  2000/07/16 18:31:09  gerd
+ *     The exception Illegal_character has been dropped.
+ *
+ * Revision 1.3  2000/07/16 16:33:57  gerd
+ *     New function write_markup_string: Handles the encoding
+ * of the string.
+ *
+ * Revision 1.2  2000/07/08 22:15:45  gerd
+ *     [Merging 0.2.10:] write_data_string: The character '%' is special, too.
+ *
+ * Revision 1.1  2000/05/29 23:48:38  gerd
+ *     Changed module names:
+ *             Markup_aux          into Pxp_aux
+ *             Markup_codewriter   into Pxp_codewriter
+ *             Markup_document     into Pxp_document
+ *             Markup_dtd          into Pxp_dtd
+ *             Markup_entity       into Pxp_entity
+ *             Markup_lexer_types  into Pxp_lexer_types
+ *             Markup_reader       into Pxp_reader
+ *             Markup_types        into Pxp_types
+ *             Markup_yacc         into Pxp_yacc
+ * See directory "compatibility" for (almost) compatible wrappers emulating
+ * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
+ *
+ * ======================================================================
+ * Old logs from markup_aux.ml:
+ *
+ * Revision 1.12  2000/05/27 19:08:30  gerd
+ *     Added functionality to check standalone declaration:
+ *
+ *     expand_attvalue: Checks whether included entities violate the
+ * stand-alone declaration.
+ *
+ *     value_of_attribute: Checks whether ENTITY/ENTITIES values violate
+ * this declaration. (Furthermore, it is checked whether the NDATA
+ * entity exists - this has been forgotten in previous versions.)
+ *
+ *     value_of_attribute/check_attribute_value_lexically: improved.
+ *
+ *     New function normalization_changes_value: helps detecting
+ * one case which violates the standalone declaration.
+ *
+ * Revision 1.11  2000/05/20 20:31:40  gerd
+ *     Big change: Added support for various encodings of the
+ * internal representation.
+ *
+ * Revision 1.10  2000/05/01 20:41:56  gerd
+ *     New function write_data_string.
+ *
+ * Revision 1.9  2000/04/30 18:11:31  gerd
+ *     New function normalize_line_separators.
+ *     In function expand_attvalue: New argument norm_crlf. If the attvalue
+ * is read directly from a file, the sequence CR LF must be converted to a
+ * single space. If the attvalue is read from a replacement text, CR LF has
+ * already converted to a single LF, and CR LF, if still occurring, must be
+ * converted to two spaces. The caller can indicate the case by passing
+ * true/false as norm_crlf.
+ *
+ * Revision 1.8  1999/09/01 22:51:07  gerd
+ *     Added functions.
+ *     'character' raises Illegal_character if characters are found that
+ * do not match the production Char.
+ *
+ * Revision 1.7  1999/09/01 16:17:37  gerd
+ *     Added function 'check_name'.
+ *
+ * Revision 1.6  1999/08/15 20:33:19  gerd
+ *     Added: a function that checks public identifiers. Only certain
+ * characters may occur in these identifiers.
+ *     Control characters are rejected by the "character" function.
+ *     Bugfix: recursive entity references are detected in attribute
+ * expansion
+ *
+ * Revision 1.5  1999/08/15 02:18:02  gerd
+ *     That '<' is not allowed in attribute values, is a violation
+ * of well-formedness, not of the validity; so WF_error is raised.
+ *
+ * Revision 1.4  1999/08/15 00:20:37  gerd
+ *     When expanding attribute values, references to parameter
+ * entities are now resolved by the method "replacement_text" which
+ * has an additional return value, and no longer by "attlist_replacement_text".
+ * The new return value indicates whether references to external entities
+ * have been resolved (directly or indirectly); this is allowed at some
+ * locations but not in attribute values.
+ *
+ * Revision 1.3  1999/08/14 22:05:53  gerd
+ *     Several functions have now a "warner" as argument which is
+ * an object with a "warn" method. This is used to warn about characters
+ * that cannot be represented in the Latin 1 alphabet.
+ *
+ * Revision 1.2  1999/08/10 21:35:06  gerd
+ *     The XML/encoding declaration at the beginning of entities is
+ * evaluated. In particular, entities have now a method "xml_declaration"
+ * which returns the name/value pairs of such a declaration. The "encoding"
+ * setting is interpreted by the entity itself; "version", and "standalone"
+ * are interpreted by Markup_yacc.parse_document_entity. Other settings
+ * are ignored (this does not conform to the standard; the standard prescribes
+ * that "version" MUST be given in the declaration of document; "standalone"
+ * and "encoding" CAN be declared; no other settings are allowed).
+ *     TODO: The user should be warned if the standard is not exactly
+ * fulfilled. -- The "standalone" property is not checked yet.
+ *
+ * Revision 1.1  1999/08/10 00:35:50  gerd
+ *     Initial revision.
+ *
+ * 
+ *)