X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_reader.ml;fp=helm%2FDEVEL%2Fpxp%2Fpxp%2Fpxp_reader.ml;h=83add26d5fcc703e3d41898372f44aec5e84aafd;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/pxp/pxp_reader.ml b/helm/DEVEL/pxp/pxp/pxp_reader.ml new file mode 100644 index 000000000..83add26d5 --- /dev/null +++ b/helm/DEVEL/pxp/pxp/pxp_reader.ml @@ -0,0 +1,730 @@ +(* $Id$ + * ---------------------------------------------------------------------- + * PXP: The polymorphic XML parser for Objective Caml. + * Copyright by Gerd Stolpmann. See LICENSE for details. + *) + +open Pxp_types;; +exception Not_competent;; +exception Not_resolvable of exn;; + +class type resolver = + object + method init_rep_encoding : rep_encoding -> unit + method init_warner : collect_warnings -> unit + method rep_encoding : rep_encoding + method open_in : ext_id -> Lexing.lexbuf + method close_in : unit + method close_all : unit + method change_encoding : string -> unit + method clone : resolver + end +;; + + +class virtual resolve_general + = + object (self) + val mutable internal_encoding = `Enc_utf8 + + val mutable encoding = `Enc_utf8 + val mutable encoding_requested = false + + val mutable warner = new drop_warnings + + val mutable enc_initialized = false + val mutable wrn_initialized = false + + val mutable clones = [] + + method init_rep_encoding e = + internal_encoding <- e; + enc_initialized <- true; + + method init_warner w = + warner <- w; + wrn_initialized <- true; + + method rep_encoding = (internal_encoding :> rep_encoding) + +(* + method clone = + ( {< encoding = `Enc_utf8; + encoding_requested = false; + >} + : # resolver :> resolver ) +*) + + method private warn (k:int) = + (* Called if a character not representable has been found. + * k is the character code. + *) + if k < 0xd800 or (k >= 0xe000 & k <= 0xfffd) or + (k >= 0x10000 & k <= 0x10ffff) then begin + warner # warn ("Code point cannot be represented: " ^ string_of_int k); + end + else + raise (WF_error("Code point " ^ string_of_int k ^ + " outside the accepted range of code points")) + + + method private autodetect s = + (* s must be at least 4 bytes long. The slot 'encoding' is + * set to: + * "UTF-16-BE": UTF-16/UCS-2 encoding big endian + * "UTF-16-LE": UTF-16/UCS-2 encoding little endian + * "UTF-8": UTF-8 encoding + *) + if String.length s < 4 then + encoding <- `Enc_utf8 + else if String.sub s 0 2 = "\254\255" then + encoding <- `Enc_utf16 + (* Note: Netconversion.recode will detect the big endianess, too *) + else if String.sub s 0 2 = "\255\254" then + encoding <- `Enc_utf16 + (* Note: Netconversion.recode will detect the little endianess, too *) + else + encoding <- `Enc_utf8 + + + method private virtual next_string : string -> int -> int -> int + method private virtual init_in : ext_id -> unit + method virtual close_in : unit + + method close_all = + List.iter (fun r -> r # close_in) clones + + method open_in xid = + assert(enc_initialized && wrn_initialized); + + encoding <- `Enc_utf8; + encoding_requested <- false; + self # init_in xid; (* may raise Not_competent *) + (* init_in: may already set 'encoding' *) + + let buffer_max = 512 in + let buffer = String.make buffer_max ' ' in + let buffer_len = ref 0 in + let buffer_end = ref false in + let fillup () = + if not !buffer_end & !buffer_len < buffer_max then begin + let l = + self # next_string buffer !buffer_len (buffer_max - !buffer_len) in + if l = 0 then + buffer_end := true + else begin + buffer_len := !buffer_len + l + end + end + in + let consume n = + let l = !buffer_len - n in + String.blit buffer n buffer 0 l; + buffer_len := l + in + + fillup(); + if not encoding_requested then self # autodetect buffer; + + Lexing.from_function + (fun s n -> + (* TODO: if encoding = internal_encoding, it is possible to + * avoid copying buffer to s because s can be directly used + * as buffer. + *) + + fillup(); + if !buffer_len = 0 then + 0 + else begin + let m_in = !buffer_len in + let m_max = if encoding_requested then n else 1 in + let n_in, n_out, encoding' = + if encoding = (internal_encoding : rep_encoding :> encoding) && + encoding_requested + then begin + (* Special case encoding = internal_encoding *) + String.blit buffer 0 s 0 m_in; + m_in, m_in, encoding + end + else + Netconversion.recode + ~in_enc:encoding + ~in_buf:buffer + ~in_pos:0 + ~in_len:m_in + ~out_enc:(internal_encoding : rep_encoding :> encoding) + ~out_buf:s + ~out_pos:0 + ~out_len:n + ~max_chars:m_max + ~subst:(fun k -> self # warn k; "") + in + if n_in = 0 then + (* An incomplete character at the end of the stream: *) + raise Netconversion.Malformed_code; + (* failwith "Badly encoded character"; *) + encoding <- encoding'; + consume n_in; + assert(n_out <> 0); + n_out + end) + + method change_encoding enc = + if not encoding_requested then begin + if enc <> "" then begin + match Netconversion.encoding_of_string enc with + `Enc_utf16 -> + (match encoding with + (`Enc_utf16_le | `Enc_utf16_be) -> () + | `Enc_utf16 -> assert false + | _ -> + raise(WF_error "Encoding of data stream and encoding declaration mismatch") + ) + | e -> + encoding <- e + end; + (* else: the autodetected encoding counts *) + encoding_requested <- true; + end; + end +;; + + +class resolve_read_any_channel ?(auto_close=true) ~channel_of_id = + object (self) + inherit resolve_general as super + + val f_open = channel_of_id + val mutable current_channel = None + val auto_close = auto_close + + method private init_in (id:ext_id) = + if current_channel <> None then + failwith "Pxp_reader.resolve_read_any_channel # init_in"; + let ch, enc_opt = f_open id in (* may raise Not_competent *) + begin match enc_opt with + None -> () + | Some enc -> encoding <- enc; encoding_requested <- true + end; + current_channel <- Some ch; + + method private next_string s ofs len = + match current_channel with + None -> failwith "Pxp_reader.resolve_read_any_channel # next_string" + | Some ch -> + input ch s ofs len + + method close_in = + match current_channel with + None -> () + | Some ch -> + if auto_close then close_in ch; + current_channel <- None + + method clone = + let c = new resolve_read_any_channel + ?auto_close:(Some auto_close) f_open in + c # init_rep_encoding internal_encoding; + c # init_warner warner; + clones <- c :: clones; + (c :> resolver) + + end +;; + + +class resolve_read_this_channel1 is_stale ?id ?fixenc ?auto_close ch = + + let getchannel = ref (fun xid -> assert false) in + + object (self) + inherit resolve_read_any_channel + ?auto_close:auto_close + (fun xid -> !getchannel xid) + as super + + val mutable is_stale = is_stale + (* The channel can only be read once. To avoid that the channel + * is opened several times, the flag 'is_stale' is set after the + * first time. + *) + + val fixid = id + val fixenc = fixenc + val fixch = ch + + initializer + getchannel := self # getchannel + + method private getchannel xid = + begin match fixid with + None -> () + | Some bound_xid -> + if xid <> bound_xid then raise Not_competent + end; + ch, fixenc + + method private init_in (id:ext_id) = + if is_stale then + raise Not_competent + else begin + super # init_in id; + is_stale <- true + end + + method close_in = + current_channel <- None + + method clone = + let c = new resolve_read_this_channel1 + is_stale + ?id:fixid ?fixenc:fixenc ?auto_close:(Some auto_close) fixch + in + c # init_rep_encoding internal_encoding; + c # init_warner warner; + clones <- c :: clones; + (c :> resolver) + + end +;; + + +class resolve_read_this_channel = + resolve_read_this_channel1 false +;; + + +class resolve_read_any_string ~string_of_id = + object (self) + inherit resolve_general as super + + val f_open = string_of_id + val mutable current_string = None + val mutable current_pos = 0 + + method private init_in (id:ext_id) = + if current_string <> None then + failwith "Pxp_reader.resolve_read_any_string # init_in"; + let s, enc_opt = f_open id in (* may raise Not_competent *) + begin match enc_opt with + None -> () + | Some enc -> encoding <- enc; encoding_requested <- true + end; + current_string <- Some s; + current_pos <- 0; + + method private next_string s ofs len = + match current_string with + None -> failwith "Pxp_reader.resolve_read_any_string # next_string" + | Some str -> + let l = min len (String.length str - current_pos) in + String.blit str current_pos s ofs l; + current_pos <- current_pos + l; + l + + method close_in = + match current_string with + None -> () + | Some _ -> + current_string <- None + + method clone = + let c = new resolve_read_any_string f_open in + c # init_rep_encoding internal_encoding; + c # init_warner warner; + clones <- c :: clones; + (c :> resolver) + end +;; + + +class resolve_read_this_string1 is_stale ?id ?fixenc str = + + let getstring = ref (fun xid -> assert false) in + + object (self) + inherit resolve_read_any_string (fun xid -> !getstring xid) as super + + val is_stale = is_stale + (* For some reasons, it is not allowed to open a clone of the resolver + * a second time when the original resolver is already open. + *) + + val fixid = id + val fixenc = fixenc + val fixstr = str + + initializer + getstring := self # getstring + + method private getstring xid = + begin match fixid with + None -> () + | Some bound_xid -> + if xid <> bound_xid then raise Not_competent + end; + fixstr, fixenc + + + method private init_in (id:ext_id) = + if is_stale then + raise Not_competent + else + super # init_in id + + method clone = + let c = new resolve_read_this_string1 + (is_stale or current_string <> None) + ?id:fixid ?fixenc:fixenc fixstr + in + c # init_rep_encoding internal_encoding; + c # init_warner warner; + clones <- c :: clones; + (c :> resolver) + end +;; + + +class resolve_read_this_string = + resolve_read_this_string1 false +;; + + +class resolve_read_url_channel + ?(base_url = Neturl.null_url) + ?auto_close + ~url_of_id + ~channel_of_url + + : resolver + = + + let getchannel = ref (fun xid -> assert false) in + + object (self) + inherit resolve_read_any_channel + ?auto_close:auto_close + (fun xid -> !getchannel xid) + as super + + val base_url = base_url + val mutable own_url = Neturl.null_url + + val url_of_id = url_of_id + val channel_of_url = channel_of_url + + + initializer + getchannel := self # getchannel + + method private getchannel xid = + let rel_url = url_of_id xid in (* may raise Not_competent *) + + try + (* Now compute the absolute URL: *) + let abs_url = Neturl.apply_relative_url base_url rel_url in + (* may raise Malformed_URL *) + + (* Simple check whether 'abs_url' is really absolute: *) + if not(Neturl.url_provides ~scheme:true abs_url) + then raise Not_competent; + + own_url <- abs_url; + (* FIXME: Copy 'abs_url' ? *) + + (* Get and return the channel: *) + channel_of_url abs_url (* may raise Not_competent *) + with + Neturl.Malformed_URL -> raise (Not_resolvable Neturl.Malformed_URL) + | Not_competent -> raise (Not_resolvable Not_found) + + method clone = + let c = + new resolve_read_url_channel + ?base_url:(Some own_url) + ?auto_close:(Some auto_close) + ~url_of_id:url_of_id + ~channel_of_url:channel_of_url + in + c # init_rep_encoding internal_encoding; + c # init_warner warner; + clones <- c :: clones; + (c :> resolve_read_url_channel) + end +;; + + +type spec = [ `Not_recognized | `Allowed | `Required ] + +class resolve_as_file + ?(file_prefix = (`Allowed :> spec)) + ?(host_prefix = (`Allowed :> spec)) + ?(system_encoding = `Enc_utf8) + ?url_of_id:passed_url_of_id + ?channel_of_url:passed_channel_of_url + () + = + + let url_syntax = + let enable_if = + function + `Not_recognized -> Neturl.Url_part_not_recognized + | `Allowed -> Neturl.Url_part_allowed + | `Required -> Neturl.Url_part_required + in + { Neturl.null_url_syntax with + Neturl.url_enable_scheme = enable_if file_prefix; + Neturl.url_enable_host = enable_if host_prefix; + Neturl.url_enable_path = Neturl.Url_part_required; + Neturl.url_accepts_8bits = true; + } + in + + let base_url_syntax = + { Neturl.null_url_syntax with + Neturl.url_enable_scheme = Neturl.Url_part_required; + Neturl.url_enable_host = Neturl.Url_part_allowed; + Neturl.url_enable_path = Neturl.Url_part_required; + Neturl.url_accepts_8bits = true; + } + in + + let default_base_url = + Neturl.make_url + ~scheme: "file" + ~host: "" + ~path: (Neturl.split_path (Sys.getcwd() ^ "/")) + base_url_syntax + in + + let file_url_of_id xid = + let file_url_of_sysname sysname = + (* By convention, we can assume that sysname is a URL conforming + * to RFC 1738 with the exception that it may contain non-ASCII + * UTF-8 characters. + *) + try + Neturl.url_of_string url_syntax sysname + (* may raise Malformed_URL *) + with + Neturl.Malformed_URL -> raise Not_competent + in + let url = + match xid with + Anonymous -> raise Not_competent + | Public (_,sysname) -> if sysname <> "" then file_url_of_sysname sysname + else raise Not_competent + | System sysname -> file_url_of_sysname sysname + in + let scheme = + try Neturl.url_scheme url with Not_found -> "file" in + let host = + try Neturl.url_host url with Not_found -> "" in + + if scheme <> "file" then raise Not_competent; + if host <> "" && host <> "localhost" then raise Not_competent; + + url + in + + let channel_of_file_url url = + try + let path_utf8 = + try Neturl.join_path (Neturl.url_path ~encoded:false url) + with Not_found -> raise Not_competent + in + + let path = + Netconversion.recode_string + ~in_enc: `Enc_utf8 + ~out_enc: system_encoding + path_utf8 in + (* May raise Bad_character_stream *) + + open_in_bin path, None + (* May raise Sys_error *) + + with + | Netconversion.Malformed_code -> assert false + (* should not happen *) + + in + + let url_of_id id = + match passed_url_of_id with + None -> + file_url_of_id id + | Some f -> + begin + try f id + with + Not_competent -> file_url_of_id id + end + in + + let channel_of_url url = + match passed_channel_of_url with + None -> + channel_of_file_url url + | Some f -> + begin + try f url + with + Not_competent -> channel_of_file_url url + end + in + + resolve_read_url_channel + ~base_url: default_base_url + ~auto_close: true + ~url_of_id: url_of_id + ~channel_of_url: channel_of_url +;; + + +class combine ?prefer rl = + object (self) + val prefered_resolver = prefer + val resolvers = (rl : resolver list) + val mutable internal_encoding = `Enc_utf8 + val mutable warner = new drop_warnings + val mutable active_resolver = None + val mutable clones = [] + + method init_rep_encoding enc = + List.iter + (fun r -> r # init_rep_encoding enc) + rl; + internal_encoding <- enc + + method init_warner w = + List.iter + (fun r -> r # init_warner w) + rl; + warner <- w; + + method rep_encoding = internal_encoding + (* CAUTION: This may not be the truth! *) + + method open_in xid = + let rec find_competent_resolver rl = + match rl with + r :: rl' -> + begin try + r, (r # open_in xid) + with + Not_competent -> find_competent_resolver rl' + end; + | [] -> + raise Not_competent + in + + if active_resolver <> None then failwith "Pxp_reader.combine # open_in"; + let r, lb = + match prefered_resolver with + None -> find_competent_resolver resolvers + | Some r -> find_competent_resolver (r :: resolvers) + in + active_resolver <- Some r; + lb + + method close_in = + match active_resolver with + None -> () + | Some r -> r # close_in; + active_resolver <- None + + method close_all = + List.iter (fun r -> r # close_in) clones + + method change_encoding (enc:string) = + match active_resolver with + None -> failwith "Pxp_reader.combine # change_encoding" + | Some r -> r # change_encoding enc + + method clone = + let c = + match active_resolver with + None -> + new combine ?prefer:None (List.map (fun q -> q # clone) resolvers) + | Some r -> + let r' = r # clone in + new combine + ?prefer:(Some r') + (List.map + (fun q -> if q == r then r' else q # clone) + resolvers) + in + c # init_rep_encoding internal_encoding; + c # init_warner warner; + clones <- c :: clones; + c + end + + + +(* ====================================================================== + * History: + * + * $Log$ + * Revision 1.1 2000/11/17 09:57:29 lpadovan + * Initial revision + * + * Revision 1.9 2000/08/14 22:24:55 gerd + * Moved the module Pxp_encoding to the netstring package under + * the new name Netconversion. + * + * Revision 1.8 2000/07/16 18:31:09 gerd + * The exception Illegal_character has been dropped. + * + * Revision 1.7 2000/07/09 15:32:01 gerd + * Fix in resolve_this_channel, resolve_this_string + * + * Revision 1.6 2000/07/09 01:05:33 gerd + * New methode 'close_all' that closes the clones, too. + * + * Revision 1.5 2000/07/08 16:24:56 gerd + * Introduced the exception 'Not_resolvable' to indicate that + * 'combine' should not try the next resolver of the list. + * + * Revision 1.4 2000/07/06 23:04:46 gerd + * Quick fix for 'combine': The active resolver is "prefered", + * but the other resolvers are also used. + * + * Revision 1.3 2000/07/06 21:43:45 gerd + * Fix: Public(_,name) is now treated as System(name) if + * name is non-empty. + * + * Revision 1.2 2000/07/04 22:13:30 gerd + * Implemented the new API rev. 1.2 of pxp_reader.mli. + * + * Revision 1.1 2000/05/29 23:48:38 gerd + * Changed module names: + * Markup_aux into Pxp_aux + * Markup_codewriter into Pxp_codewriter + * Markup_document into Pxp_document + * Markup_dtd into Pxp_dtd + * Markup_entity into Pxp_entity + * Markup_lexer_types into Pxp_lexer_types + * Markup_reader into Pxp_reader + * Markup_types into Pxp_types + * Markup_yacc into Pxp_yacc + * See directory "compatibility" for (almost) compatible wrappers emulating + * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. + * + * ====================================================================== + * Old logs from markup_reader.ml: + * + * Revision 1.3 2000/05/29 21:14:57 gerd + * Changed the type 'encoding' into a polymorphic variant. + * + * Revision 1.2 2000/05/20 20:31:40 gerd + * Big change: Added support for various encodings of the + * internal representation. + * + * Revision 1.1 2000/03/13 23:41:44 gerd + * Initial revision; this code was formerly part of Markup_entity. + * + * + *)