--- /dev/null
+(* $Id$
+ * ----------------------------------------------------------------------
+ * PXP: The polymorphic XML parser for Objective Caml.
+ * Copyright by Gerd Stolpmann. See LICENSE for details.
+ *)
+
+open Pxp_types;;
+exception Not_competent;;
+exception Not_resolvable of exn;;
+
+class type resolver =
+ object
+ method init_rep_encoding : rep_encoding -> unit
+ method init_warner : collect_warnings -> unit
+ method rep_encoding : rep_encoding
+ method open_in : ext_id -> Lexing.lexbuf
+ method close_in : unit
+ method close_all : unit
+ method change_encoding : string -> unit
+ method clone : resolver
+ end
+;;
+
+
+class virtual resolve_general
+ =
+ object (self)
+ val mutable internal_encoding = `Enc_utf8
+
+ val mutable encoding = `Enc_utf8
+ val mutable encoding_requested = false
+
+ val mutable warner = new drop_warnings
+
+ val mutable enc_initialized = false
+ val mutable wrn_initialized = false
+
+ val mutable clones = []
+
+ method init_rep_encoding e =
+ internal_encoding <- e;
+ enc_initialized <- true;
+
+ method init_warner w =
+ warner <- w;
+ wrn_initialized <- true;
+
+ method rep_encoding = (internal_encoding :> rep_encoding)
+
+(*
+ method clone =
+ ( {< encoding = `Enc_utf8;
+ encoding_requested = false;
+ >}
+ : # resolver :> resolver )
+*)
+
+ method private warn (k:int) =
+ (* Called if a character not representable has been found.
+ * k is the character code.
+ *)
+ if k < 0xd800 or (k >= 0xe000 & k <= 0xfffd) or
+ (k >= 0x10000 & k <= 0x10ffff) then begin
+ warner # warn ("Code point cannot be represented: " ^ string_of_int k);
+ end
+ else
+ raise (WF_error("Code point " ^ string_of_int k ^
+ " outside the accepted range of code points"))
+
+
+ method private autodetect s =
+ (* s must be at least 4 bytes long. The slot 'encoding' is
+ * set to:
+ * "UTF-16-BE": UTF-16/UCS-2 encoding big endian
+ * "UTF-16-LE": UTF-16/UCS-2 encoding little endian
+ * "UTF-8": UTF-8 encoding
+ *)
+ if String.length s < 4 then
+ encoding <- `Enc_utf8
+ else if String.sub s 0 2 = "\254\255" then
+ encoding <- `Enc_utf16
+ (* Note: Netconversion.recode will detect the big endianess, too *)
+ else if String.sub s 0 2 = "\255\254" then
+ encoding <- `Enc_utf16
+ (* Note: Netconversion.recode will detect the little endianess, too *)
+ else
+ encoding <- `Enc_utf8
+
+
+ method private virtual next_string : string -> int -> int -> int
+ method private virtual init_in : ext_id -> unit
+ method virtual close_in : unit
+
+ method close_all =
+ List.iter (fun r -> r # close_in) clones
+
+ method open_in xid =
+ assert(enc_initialized && wrn_initialized);
+
+ encoding <- `Enc_utf8;
+ encoding_requested <- false;
+ self # init_in xid; (* may raise Not_competent *)
+ (* init_in: may already set 'encoding' *)
+
+ let buffer_max = 512 in
+ let buffer = String.make buffer_max ' ' in
+ let buffer_len = ref 0 in
+ let buffer_end = ref false in
+ let fillup () =
+ if not !buffer_end & !buffer_len < buffer_max then begin
+ let l =
+ self # next_string buffer !buffer_len (buffer_max - !buffer_len) in
+ if l = 0 then
+ buffer_end := true
+ else begin
+ buffer_len := !buffer_len + l
+ end
+ end
+ in
+ let consume n =
+ let l = !buffer_len - n in
+ String.blit buffer n buffer 0 l;
+ buffer_len := l
+ in
+
+ fillup();
+ if not encoding_requested then self # autodetect buffer;
+
+ Lexing.from_function
+ (fun s n ->
+ (* TODO: if encoding = internal_encoding, it is possible to
+ * avoid copying buffer to s because s can be directly used
+ * as buffer.
+ *)
+
+ fillup();
+ if !buffer_len = 0 then
+ 0
+ else begin
+ let m_in = !buffer_len in
+ let m_max = if encoding_requested then n else 1 in
+ let n_in, n_out, encoding' =
+ if encoding = (internal_encoding : rep_encoding :> encoding) &&
+ encoding_requested
+ then begin
+ (* Special case encoding = internal_encoding *)
+ String.blit buffer 0 s 0 m_in;
+ m_in, m_in, encoding
+ end
+ else
+ Netconversion.recode
+ ~in_enc:encoding
+ ~in_buf:buffer
+ ~in_pos:0
+ ~in_len:m_in
+ ~out_enc:(internal_encoding : rep_encoding :> encoding)
+ ~out_buf:s
+ ~out_pos:0
+ ~out_len:n
+ ~max_chars:m_max
+ ~subst:(fun k -> self # warn k; "")
+ in
+ if n_in = 0 then
+ (* An incomplete character at the end of the stream: *)
+ raise Netconversion.Malformed_code;
+ (* failwith "Badly encoded character"; *)
+ encoding <- encoding';
+ consume n_in;
+ assert(n_out <> 0);
+ n_out
+ end)
+
+ method change_encoding enc =
+ if not encoding_requested then begin
+ if enc <> "" then begin
+ match Netconversion.encoding_of_string enc with
+ `Enc_utf16 ->
+ (match encoding with
+ (`Enc_utf16_le | `Enc_utf16_be) -> ()
+ | `Enc_utf16 -> assert false
+ | _ ->
+ raise(WF_error "Encoding of data stream and encoding declaration mismatch")
+ )
+ | e ->
+ encoding <- e
+ end;
+ (* else: the autodetected encoding counts *)
+ encoding_requested <- true;
+ end;
+ end
+;;
+
+
+class resolve_read_any_channel ?(auto_close=true) ~channel_of_id =
+ object (self)
+ inherit resolve_general as super
+
+ val f_open = channel_of_id
+ val mutable current_channel = None
+ val auto_close = auto_close
+
+ method private init_in (id:ext_id) =
+ if current_channel <> None then
+ failwith "Pxp_reader.resolve_read_any_channel # init_in";
+ let ch, enc_opt = f_open id in (* may raise Not_competent *)
+ begin match enc_opt with
+ None -> ()
+ | Some enc -> encoding <- enc; encoding_requested <- true
+ end;
+ current_channel <- Some ch;
+
+ method private next_string s ofs len =
+ match current_channel with
+ None -> failwith "Pxp_reader.resolve_read_any_channel # next_string"
+ | Some ch ->
+ input ch s ofs len
+
+ method close_in =
+ match current_channel with
+ None -> ()
+ | Some ch ->
+ if auto_close then close_in ch;
+ current_channel <- None
+
+ method clone =
+ let c = new resolve_read_any_channel
+ ?auto_close:(Some auto_close) f_open in
+ c # init_rep_encoding internal_encoding;
+ c # init_warner warner;
+ clones <- c :: clones;
+ (c :> resolver)
+
+ end
+;;
+
+
+class resolve_read_this_channel1 is_stale ?id ?fixenc ?auto_close ch =
+
+ let getchannel = ref (fun xid -> assert false) in
+
+ object (self)
+ inherit resolve_read_any_channel
+ ?auto_close:auto_close
+ (fun xid -> !getchannel xid)
+ as super
+
+ val mutable is_stale = is_stale
+ (* The channel can only be read once. To avoid that the channel
+ * is opened several times, the flag 'is_stale' is set after the
+ * first time.
+ *)
+
+ val fixid = id
+ val fixenc = fixenc
+ val fixch = ch
+
+ initializer
+ getchannel := self # getchannel
+
+ method private getchannel xid =
+ begin match fixid with
+ None -> ()
+ | Some bound_xid ->
+ if xid <> bound_xid then raise Not_competent
+ end;
+ ch, fixenc
+
+ method private init_in (id:ext_id) =
+ if is_stale then
+ raise Not_competent
+ else begin
+ super # init_in id;
+ is_stale <- true
+ end
+
+ method close_in =
+ current_channel <- None
+
+ method clone =
+ let c = new resolve_read_this_channel1
+ is_stale
+ ?id:fixid ?fixenc:fixenc ?auto_close:(Some auto_close) fixch
+ in
+ c # init_rep_encoding internal_encoding;
+ c # init_warner warner;
+ clones <- c :: clones;
+ (c :> resolver)
+
+ end
+;;
+
+
+class resolve_read_this_channel =
+ resolve_read_this_channel1 false
+;;
+
+
+class resolve_read_any_string ~string_of_id =
+ object (self)
+ inherit resolve_general as super
+
+ val f_open = string_of_id
+ val mutable current_string = None
+ val mutable current_pos = 0
+
+ method private init_in (id:ext_id) =
+ if current_string <> None then
+ failwith "Pxp_reader.resolve_read_any_string # init_in";
+ let s, enc_opt = f_open id in (* may raise Not_competent *)
+ begin match enc_opt with
+ None -> ()
+ | Some enc -> encoding <- enc; encoding_requested <- true
+ end;
+ current_string <- Some s;
+ current_pos <- 0;
+
+ method private next_string s ofs len =
+ match current_string with
+ None -> failwith "Pxp_reader.resolve_read_any_string # next_string"
+ | Some str ->
+ let l = min len (String.length str - current_pos) in
+ String.blit str current_pos s ofs l;
+ current_pos <- current_pos + l;
+ l
+
+ method close_in =
+ match current_string with
+ None -> ()
+ | Some _ ->
+ current_string <- None
+
+ method clone =
+ let c = new resolve_read_any_string f_open in
+ c # init_rep_encoding internal_encoding;
+ c # init_warner warner;
+ clones <- c :: clones;
+ (c :> resolver)
+ end
+;;
+
+
+class resolve_read_this_string1 is_stale ?id ?fixenc str =
+
+ let getstring = ref (fun xid -> assert false) in
+
+ object (self)
+ inherit resolve_read_any_string (fun xid -> !getstring xid) as super
+
+ val is_stale = is_stale
+ (* For some reasons, it is not allowed to open a clone of the resolver
+ * a second time when the original resolver is already open.
+ *)
+
+ val fixid = id
+ val fixenc = fixenc
+ val fixstr = str
+
+ initializer
+ getstring := self # getstring
+
+ method private getstring xid =
+ begin match fixid with
+ None -> ()
+ | Some bound_xid ->
+ if xid <> bound_xid then raise Not_competent
+ end;
+ fixstr, fixenc
+
+
+ method private init_in (id:ext_id) =
+ if is_stale then
+ raise Not_competent
+ else
+ super # init_in id
+
+ method clone =
+ let c = new resolve_read_this_string1
+ (is_stale or current_string <> None)
+ ?id:fixid ?fixenc:fixenc fixstr
+ in
+ c # init_rep_encoding internal_encoding;
+ c # init_warner warner;
+ clones <- c :: clones;
+ (c :> resolver)
+ end
+;;
+
+
+class resolve_read_this_string =
+ resolve_read_this_string1 false
+;;
+
+
+class resolve_read_url_channel
+ ?(base_url = Neturl.null_url)
+ ?auto_close
+ ~url_of_id
+ ~channel_of_url
+
+ : resolver
+ =
+
+ let getchannel = ref (fun xid -> assert false) in
+
+ object (self)
+ inherit resolve_read_any_channel
+ ?auto_close:auto_close
+ (fun xid -> !getchannel xid)
+ as super
+
+ val base_url = base_url
+ val mutable own_url = Neturl.null_url
+
+ val url_of_id = url_of_id
+ val channel_of_url = channel_of_url
+
+
+ initializer
+ getchannel := self # getchannel
+
+ method private getchannel xid =
+ let rel_url = url_of_id xid in (* may raise Not_competent *)
+
+ try
+ (* Now compute the absolute URL: *)
+ let abs_url = Neturl.apply_relative_url base_url rel_url in
+ (* may raise Malformed_URL *)
+
+ (* Simple check whether 'abs_url' is really absolute: *)
+ if not(Neturl.url_provides ~scheme:true abs_url)
+ then raise Not_competent;
+
+ own_url <- abs_url;
+ (* FIXME: Copy 'abs_url' ? *)
+
+ (* Get and return the channel: *)
+ channel_of_url abs_url (* may raise Not_competent *)
+ with
+ Neturl.Malformed_URL -> raise (Not_resolvable Neturl.Malformed_URL)
+ | Not_competent -> raise (Not_resolvable Not_found)
+
+ method clone =
+ let c =
+ new resolve_read_url_channel
+ ?base_url:(Some own_url)
+ ?auto_close:(Some auto_close)
+ ~url_of_id:url_of_id
+ ~channel_of_url:channel_of_url
+ in
+ c # init_rep_encoding internal_encoding;
+ c # init_warner warner;
+ clones <- c :: clones;
+ (c :> resolve_read_url_channel)
+ end
+;;
+
+
+type spec = [ `Not_recognized | `Allowed | `Required ]
+
+class resolve_as_file
+ ?(file_prefix = (`Allowed :> spec))
+ ?(host_prefix = (`Allowed :> spec))
+ ?(system_encoding = `Enc_utf8)
+ ?url_of_id:passed_url_of_id
+ ?channel_of_url:passed_channel_of_url
+ ()
+ =
+
+ let url_syntax =
+ let enable_if =
+ function
+ `Not_recognized -> Neturl.Url_part_not_recognized
+ | `Allowed -> Neturl.Url_part_allowed
+ | `Required -> Neturl.Url_part_required
+ in
+ { Neturl.null_url_syntax with
+ Neturl.url_enable_scheme = enable_if file_prefix;
+ Neturl.url_enable_host = enable_if host_prefix;
+ Neturl.url_enable_path = Neturl.Url_part_required;
+ Neturl.url_accepts_8bits = true;
+ }
+ in
+
+ let base_url_syntax =
+ { Neturl.null_url_syntax with
+ Neturl.url_enable_scheme = Neturl.Url_part_required;
+ Neturl.url_enable_host = Neturl.Url_part_allowed;
+ Neturl.url_enable_path = Neturl.Url_part_required;
+ Neturl.url_accepts_8bits = true;
+ }
+ in
+
+ let default_base_url =
+ Neturl.make_url
+ ~scheme: "file"
+ ~host: ""
+ ~path: (Neturl.split_path (Sys.getcwd() ^ "/"))
+ base_url_syntax
+ in
+
+ let file_url_of_id xid =
+ let file_url_of_sysname sysname =
+ (* By convention, we can assume that sysname is a URL conforming
+ * to RFC 1738 with the exception that it may contain non-ASCII
+ * UTF-8 characters.
+ *)
+ try
+ Neturl.url_of_string url_syntax sysname
+ (* may raise Malformed_URL *)
+ with
+ Neturl.Malformed_URL -> raise Not_competent
+ in
+ let url =
+ match xid with
+ Anonymous -> raise Not_competent
+ | Public (_,sysname) -> if sysname <> "" then file_url_of_sysname sysname
+ else raise Not_competent
+ | System sysname -> file_url_of_sysname sysname
+ in
+ let scheme =
+ try Neturl.url_scheme url with Not_found -> "file" in
+ let host =
+ try Neturl.url_host url with Not_found -> "" in
+
+ if scheme <> "file" then raise Not_competent;
+ if host <> "" && host <> "localhost" then raise Not_competent;
+
+ url
+ in
+
+ let channel_of_file_url url =
+ try
+ let path_utf8 =
+ try Neturl.join_path (Neturl.url_path ~encoded:false url)
+ with Not_found -> raise Not_competent
+ in
+
+ let path =
+ Netconversion.recode_string
+ ~in_enc: `Enc_utf8
+ ~out_enc: system_encoding
+ path_utf8 in
+ (* May raise Bad_character_stream *)
+
+ open_in_bin path, None
+ (* May raise Sys_error *)
+
+ with
+ | Netconversion.Malformed_code -> assert false
+ (* should not happen *)
+
+ in
+
+ let url_of_id id =
+ match passed_url_of_id with
+ None ->
+ file_url_of_id id
+ | Some f ->
+ begin
+ try f id
+ with
+ Not_competent -> file_url_of_id id
+ end
+ in
+
+ let channel_of_url url =
+ match passed_channel_of_url with
+ None ->
+ channel_of_file_url url
+ | Some f ->
+ begin
+ try f url
+ with
+ Not_competent -> channel_of_file_url url
+ end
+ in
+
+ resolve_read_url_channel
+ ~base_url: default_base_url
+ ~auto_close: true
+ ~url_of_id: url_of_id
+ ~channel_of_url: channel_of_url
+;;
+
+
+class combine ?prefer rl =
+ object (self)
+ val prefered_resolver = prefer
+ val resolvers = (rl : resolver list)
+ val mutable internal_encoding = `Enc_utf8
+ val mutable warner = new drop_warnings
+ val mutable active_resolver = None
+ val mutable clones = []
+
+ method init_rep_encoding enc =
+ List.iter
+ (fun r -> r # init_rep_encoding enc)
+ rl;
+ internal_encoding <- enc
+
+ method init_warner w =
+ List.iter
+ (fun r -> r # init_warner w)
+ rl;
+ warner <- w;
+
+ method rep_encoding = internal_encoding
+ (* CAUTION: This may not be the truth! *)
+
+ method open_in xid =
+ let rec find_competent_resolver rl =
+ match rl with
+ r :: rl' ->
+ begin try
+ r, (r # open_in xid)
+ with
+ Not_competent -> find_competent_resolver rl'
+ end;
+ | [] ->
+ raise Not_competent
+ in
+
+ if active_resolver <> None then failwith "Pxp_reader.combine # open_in";
+ let r, lb =
+ match prefered_resolver with
+ None -> find_competent_resolver resolvers
+ | Some r -> find_competent_resolver (r :: resolvers)
+ in
+ active_resolver <- Some r;
+ lb
+
+ method close_in =
+ match active_resolver with
+ None -> ()
+ | Some r -> r # close_in;
+ active_resolver <- None
+
+ method close_all =
+ List.iter (fun r -> r # close_in) clones
+
+ method change_encoding (enc:string) =
+ match active_resolver with
+ None -> failwith "Pxp_reader.combine # change_encoding"
+ | Some r -> r # change_encoding enc
+
+ method clone =
+ let c =
+ match active_resolver with
+ None ->
+ new combine ?prefer:None (List.map (fun q -> q # clone) resolvers)
+ | Some r ->
+ let r' = r # clone in
+ new combine
+ ?prefer:(Some r')
+ (List.map
+ (fun q -> if q == r then r' else q # clone)
+ resolvers)
+ in
+ c # init_rep_encoding internal_encoding;
+ c # init_warner warner;
+ clones <- c :: clones;
+ c
+ end
+
+
+
+(* ======================================================================
+ * History:
+ *
+ * $Log$
+ * Revision 1.1 2000/11/17 09:57:29 lpadovan
+ * Initial revision
+ *
+ * Revision 1.9 2000/08/14 22:24:55 gerd
+ * Moved the module Pxp_encoding to the netstring package under
+ * the new name Netconversion.
+ *
+ * Revision 1.8 2000/07/16 18:31:09 gerd
+ * The exception Illegal_character has been dropped.
+ *
+ * Revision 1.7 2000/07/09 15:32:01 gerd
+ * Fix in resolve_this_channel, resolve_this_string
+ *
+ * Revision 1.6 2000/07/09 01:05:33 gerd
+ * New methode 'close_all' that closes the clones, too.
+ *
+ * Revision 1.5 2000/07/08 16:24:56 gerd
+ * Introduced the exception 'Not_resolvable' to indicate that
+ * 'combine' should not try the next resolver of the list.
+ *
+ * Revision 1.4 2000/07/06 23:04:46 gerd
+ * Quick fix for 'combine': The active resolver is "prefered",
+ * but the other resolvers are also used.
+ *
+ * Revision 1.3 2000/07/06 21:43:45 gerd
+ * Fix: Public(_,name) is now treated as System(name) if
+ * name is non-empty.
+ *
+ * Revision 1.2 2000/07/04 22:13:30 gerd
+ * Implemented the new API rev. 1.2 of pxp_reader.mli.
+ *
+ * Revision 1.1 2000/05/29 23:48:38 gerd
+ * Changed module names:
+ * Markup_aux into Pxp_aux
+ * Markup_codewriter into Pxp_codewriter
+ * Markup_document into Pxp_document
+ * Markup_dtd into Pxp_dtd
+ * Markup_entity into Pxp_entity
+ * Markup_lexer_types into Pxp_lexer_types
+ * Markup_reader into Pxp_reader
+ * Markup_types into Pxp_types
+ * Markup_yacc into Pxp_yacc
+ * See directory "compatibility" for (almost) compatible wrappers emulating
+ * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
+ *
+ * ======================================================================
+ * Old logs from markup_reader.ml:
+ *
+ * Revision 1.3 2000/05/29 21:14:57 gerd
+ * Changed the type 'encoding' into a polymorphic variant.
+ *
+ * Revision 1.2 2000/05/20 20:31:40 gerd
+ * Big change: Added support for various encodings of the
+ * internal representation.
+ *
+ * Revision 1.1 2000/03/13 23:41:44 gerd
+ * Initial revision; this code was formerly part of Markup_entity.
+ *
+ *
+ *)