2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
8 exception Not_competent;;
9 exception Not_resolvable of exn;;
13 method init_rep_encoding : rep_encoding -> unit
14 method init_warner : collect_warnings -> unit
15 method rep_encoding : rep_encoding
16 method open_in : ext_id -> Lexing.lexbuf
17 method close_in : unit
18 method close_all : unit
19 method change_encoding : string -> unit
20 method clone : resolver
25 class virtual resolve_general
28 val mutable internal_encoding = `Enc_utf8
30 val mutable encoding = `Enc_utf8
31 val mutable encoding_requested = false
33 val mutable warner = new drop_warnings
35 val mutable enc_initialized = false
36 val mutable wrn_initialized = false
38 val mutable clones = []
40 method init_rep_encoding e =
41 internal_encoding <- e;
42 enc_initialized <- true;
44 method init_warner w =
46 wrn_initialized <- true;
48 method rep_encoding = (internal_encoding :> rep_encoding)
52 ( {< encoding = `Enc_utf8;
53 encoding_requested = false;
55 : # resolver :> resolver )
58 method private warn (k:int) =
59 (* Called if a character not representable has been found.
60 * k is the character code.
62 if k < 0xd800 or (k >= 0xe000 & k <= 0xfffd) or
63 (k >= 0x10000 & k <= 0x10ffff) then begin
64 warner # warn ("Code point cannot be represented: " ^ string_of_int k);
67 raise (WF_error("Code point " ^ string_of_int k ^
68 " outside the accepted range of code points"))
71 method private autodetect s =
72 (* s must be at least 4 bytes long. The slot 'encoding' is
74 * "UTF-16-BE": UTF-16/UCS-2 encoding big endian
75 * "UTF-16-LE": UTF-16/UCS-2 encoding little endian
76 * "UTF-8": UTF-8 encoding
78 if String.length s < 4 then
80 else if String.sub s 0 2 = "\254\255" then
81 encoding <- `Enc_utf16
82 (* Note: Netconversion.recode will detect the big endianess, too *)
83 else if String.sub s 0 2 = "\255\254" then
84 encoding <- `Enc_utf16
85 (* Note: Netconversion.recode will detect the little endianess, too *)
90 method private virtual next_string : string -> int -> int -> int
91 method private virtual init_in : ext_id -> unit
92 method virtual close_in : unit
95 List.iter (fun r -> r # close_in) clones
98 assert(enc_initialized && wrn_initialized);
100 encoding <- `Enc_utf8;
101 encoding_requested <- false;
102 self # init_in xid; (* may raise Not_competent *)
103 (* init_in: may already set 'encoding' *)
105 let buffer_max = 512 in
106 let buffer = String.make buffer_max ' ' in
107 let buffer_len = ref 0 in
108 let buffer_end = ref false in
110 if not !buffer_end & !buffer_len < buffer_max then begin
112 self # next_string buffer !buffer_len (buffer_max - !buffer_len) in
116 buffer_len := !buffer_len + l
121 let l = !buffer_len - n in
122 String.blit buffer n buffer 0 l;
127 if not encoding_requested then self # autodetect buffer;
131 (* TODO: if encoding = internal_encoding, it is possible to
132 * avoid copying buffer to s because s can be directly used
137 if !buffer_len = 0 then
140 let m_in = !buffer_len in
141 let m_max = if encoding_requested then n else 1 in
142 let n_in, n_out, encoding' =
143 if encoding = (internal_encoding : rep_encoding :> encoding) &&
146 (* Special case encoding = internal_encoding *)
147 String.blit buffer 0 s 0 m_in;
156 ~out_enc:(internal_encoding : rep_encoding :> encoding)
161 ~subst:(fun k -> self # warn k; "")
164 (* An incomplete character at the end of the stream: *)
165 raise Netconversion.Malformed_code;
166 (* failwith "Badly encoded character"; *)
167 encoding <- encoding';
173 method change_encoding enc =
174 if not encoding_requested then begin
175 if enc <> "" then begin
176 match Netconversion.encoding_of_string enc with
179 (`Enc_utf16_le | `Enc_utf16_be) -> ()
180 | `Enc_utf16 -> assert false
182 raise(WF_error "Encoding of data stream and encoding declaration mismatch")
187 (* else: the autodetected encoding counts *)
188 encoding_requested <- true;
194 class resolve_read_any_channel ?(auto_close=true) ~channel_of_id =
196 inherit resolve_general as super
198 val f_open = channel_of_id
199 val mutable current_channel = None
200 val auto_close = auto_close
202 method private init_in (id:ext_id) =
203 if current_channel <> None then
204 failwith "Pxp_reader.resolve_read_any_channel # init_in";
205 let ch, enc_opt = f_open id in (* may raise Not_competent *)
206 begin match enc_opt with
208 | Some enc -> encoding <- enc; encoding_requested <- true
210 current_channel <- Some ch;
212 method private next_string s ofs len =
213 match current_channel with
214 None -> failwith "Pxp_reader.resolve_read_any_channel # next_string"
219 match current_channel with
222 if auto_close then close_in ch;
223 current_channel <- None
226 let c = new resolve_read_any_channel
227 ?auto_close:(Some auto_close) f_open in
228 c # init_rep_encoding internal_encoding;
229 c # init_warner warner;
230 clones <- c :: clones;
237 class resolve_read_this_channel1 is_stale ?id ?fixenc ?auto_close ch =
239 let getchannel = ref (fun xid -> assert false) in
242 inherit resolve_read_any_channel
243 ?auto_close:auto_close
244 (fun xid -> !getchannel xid)
247 val mutable is_stale = is_stale
248 (* The channel can only be read once. To avoid that the channel
249 * is opened several times, the flag 'is_stale' is set after the
258 getchannel := self # getchannel
260 method private getchannel xid =
261 begin match fixid with
264 if xid <> bound_xid then raise Not_competent
268 method private init_in (id:ext_id) =
277 current_channel <- None
280 let c = new resolve_read_this_channel1
282 ?id:fixid ?fixenc:fixenc ?auto_close:(Some auto_close) fixch
284 c # init_rep_encoding internal_encoding;
285 c # init_warner warner;
286 clones <- c :: clones;
293 class resolve_read_this_channel =
294 resolve_read_this_channel1 false
298 class resolve_read_any_string ~string_of_id =
300 inherit resolve_general as super
302 val f_open = string_of_id
303 val mutable current_string = None
304 val mutable current_pos = 0
306 method private init_in (id:ext_id) =
307 if current_string <> None then
308 failwith "Pxp_reader.resolve_read_any_string # init_in";
309 let s, enc_opt = f_open id in (* may raise Not_competent *)
310 begin match enc_opt with
312 | Some enc -> encoding <- enc; encoding_requested <- true
314 current_string <- Some s;
317 method private next_string s ofs len =
318 match current_string with
319 None -> failwith "Pxp_reader.resolve_read_any_string # next_string"
321 let l = min len (String.length str - current_pos) in
322 String.blit str current_pos s ofs l;
323 current_pos <- current_pos + l;
327 match current_string with
330 current_string <- None
333 let c = new resolve_read_any_string f_open in
334 c # init_rep_encoding internal_encoding;
335 c # init_warner warner;
336 clones <- c :: clones;
342 class resolve_read_this_string1 is_stale ?id ?fixenc str =
344 let getstring = ref (fun xid -> assert false) in
347 inherit resolve_read_any_string (fun xid -> !getstring xid) as super
349 val is_stale = is_stale
350 (* For some reasons, it is not allowed to open a clone of the resolver
351 * a second time when the original resolver is already open.
359 getstring := self # getstring
361 method private getstring xid =
362 begin match fixid with
365 if xid <> bound_xid then raise Not_competent
370 method private init_in (id:ext_id) =
377 let c = new resolve_read_this_string1
378 (is_stale or current_string <> None)
379 ?id:fixid ?fixenc:fixenc fixstr
381 c # init_rep_encoding internal_encoding;
382 c # init_warner warner;
383 clones <- c :: clones;
389 class resolve_read_this_string =
390 resolve_read_this_string1 false
394 class resolve_read_url_channel
395 ?(base_url = Neturl.null_url)
403 let getchannel = ref (fun xid -> assert false) in
406 inherit resolve_read_any_channel
407 ?auto_close:auto_close
408 (fun xid -> !getchannel xid)
411 val base_url = base_url
412 val mutable own_url = Neturl.null_url
414 val url_of_id = url_of_id
415 val channel_of_url = channel_of_url
419 getchannel := self # getchannel
421 method private getchannel xid =
422 let rel_url = url_of_id xid in (* may raise Not_competent *)
425 (* Now compute the absolute URL: *)
426 let abs_url = Neturl.apply_relative_url base_url rel_url in
427 (* may raise Malformed_URL *)
429 (* Simple check whether 'abs_url' is really absolute: *)
430 if not(Neturl.url_provides ~scheme:true abs_url)
431 then raise Not_competent;
434 (* FIXME: Copy 'abs_url' ? *)
436 (* Get and return the channel: *)
437 channel_of_url abs_url (* may raise Not_competent *)
439 Neturl.Malformed_URL -> raise (Not_resolvable Neturl.Malformed_URL)
440 | Not_competent -> raise (Not_resolvable Not_found)
444 new resolve_read_url_channel
445 ?base_url:(Some own_url)
446 ?auto_close:(Some auto_close)
448 ~channel_of_url:channel_of_url
450 c # init_rep_encoding internal_encoding;
451 c # init_warner warner;
452 clones <- c :: clones;
453 (c :> resolve_read_url_channel)
458 type spec = [ `Not_recognized | `Allowed | `Required ]
460 class resolve_as_file
461 ?(file_prefix = (`Allowed :> spec))
462 ?(host_prefix = (`Allowed :> spec))
463 ?(system_encoding = `Enc_utf8)
464 ?url_of_id:passed_url_of_id
465 ?channel_of_url:passed_channel_of_url
472 `Not_recognized -> Neturl.Url_part_not_recognized
473 | `Allowed -> Neturl.Url_part_allowed
474 | `Required -> Neturl.Url_part_required
476 { Neturl.null_url_syntax with
477 Neturl.url_enable_scheme = enable_if file_prefix;
478 Neturl.url_enable_host = enable_if host_prefix;
479 Neturl.url_enable_path = Neturl.Url_part_required;
480 Neturl.url_accepts_8bits = true;
484 let base_url_syntax =
485 { Neturl.null_url_syntax with
486 Neturl.url_enable_scheme = Neturl.Url_part_required;
487 Neturl.url_enable_host = Neturl.Url_part_allowed;
488 Neturl.url_enable_path = Neturl.Url_part_required;
489 Neturl.url_accepts_8bits = true;
493 let default_base_url =
497 ~path: (Neturl.split_path (Sys.getcwd() ^ "/"))
501 let file_url_of_id xid =
502 let file_url_of_sysname sysname =
503 (* By convention, we can assume that sysname is a URL conforming
504 * to RFC 1738 with the exception that it may contain non-ASCII
508 Neturl.url_of_string url_syntax sysname
509 (* may raise Malformed_URL *)
511 Neturl.Malformed_URL -> raise Not_competent
515 Anonymous -> raise Not_competent
516 | Public (_,sysname) -> if sysname <> "" then file_url_of_sysname sysname
517 else raise Not_competent
518 | System sysname -> file_url_of_sysname sysname
521 try Neturl.url_scheme url with Not_found -> "file" in
523 try Neturl.url_host url with Not_found -> "" in
525 if scheme <> "file" then raise Not_competent;
526 if host <> "" && host <> "localhost" then raise Not_competent;
531 let channel_of_file_url url =
534 try Neturl.join_path (Neturl.url_path ~encoded:false url)
535 with Not_found -> raise Not_competent
539 Netconversion.recode_string
541 ~out_enc: system_encoding
543 (* May raise Bad_character_stream *)
545 open_in_bin path, None
546 (* May raise Sys_error *)
549 | Netconversion.Malformed_code -> assert false
550 (* should not happen *)
555 match passed_url_of_id with
562 Not_competent -> file_url_of_id id
566 let channel_of_url url =
567 match passed_channel_of_url with
569 channel_of_file_url url
574 Not_competent -> channel_of_file_url url
578 resolve_read_url_channel
579 ~base_url: default_base_url
581 ~url_of_id: url_of_id
582 ~channel_of_url: channel_of_url
586 class combine ?prefer rl =
588 val prefered_resolver = prefer
589 val resolvers = (rl : resolver list)
590 val mutable internal_encoding = `Enc_utf8
591 val mutable warner = new drop_warnings
592 val mutable active_resolver = None
593 val mutable clones = []
595 method init_rep_encoding enc =
597 (fun r -> r # init_rep_encoding enc)
599 internal_encoding <- enc
601 method init_warner w =
603 (fun r -> r # init_warner w)
607 method rep_encoding = internal_encoding
608 (* CAUTION: This may not be the truth! *)
611 let rec find_competent_resolver rl =
617 Not_competent -> find_competent_resolver rl'
623 if active_resolver <> None then failwith "Pxp_reader.combine # open_in";
625 match prefered_resolver with
626 None -> find_competent_resolver resolvers
627 | Some r -> find_competent_resolver (r :: resolvers)
629 active_resolver <- Some r;
633 match active_resolver with
635 | Some r -> r # close_in;
636 active_resolver <- None
639 List.iter (fun r -> r # close_in) clones
641 method change_encoding (enc:string) =
642 match active_resolver with
643 None -> failwith "Pxp_reader.combine # change_encoding"
644 | Some r -> r # change_encoding enc
648 match active_resolver with
650 new combine ?prefer:None (List.map (fun q -> q # clone) resolvers)
652 let r' = r # clone in
656 (fun q -> if q == r then r' else q # clone)
659 c # init_rep_encoding internal_encoding;
660 c # init_warner warner;
661 clones <- c :: clones;
667 (* ======================================================================
671 * Revision 1.1 2000/11/17 09:57:29 lpadovan
674 * Revision 1.9 2000/08/14 22:24:55 gerd
675 * Moved the module Pxp_encoding to the netstring package under
676 * the new name Netconversion.
678 * Revision 1.8 2000/07/16 18:31:09 gerd
679 * The exception Illegal_character has been dropped.
681 * Revision 1.7 2000/07/09 15:32:01 gerd
682 * Fix in resolve_this_channel, resolve_this_string
684 * Revision 1.6 2000/07/09 01:05:33 gerd
685 * New methode 'close_all' that closes the clones, too.
687 * Revision 1.5 2000/07/08 16:24:56 gerd
688 * Introduced the exception 'Not_resolvable' to indicate that
689 * 'combine' should not try the next resolver of the list.
691 * Revision 1.4 2000/07/06 23:04:46 gerd
692 * Quick fix for 'combine': The active resolver is "prefered",
693 * but the other resolvers are also used.
695 * Revision 1.3 2000/07/06 21:43:45 gerd
696 * Fix: Public(_,name) is now treated as System(name) if
699 * Revision 1.2 2000/07/04 22:13:30 gerd
700 * Implemented the new API rev. 1.2 of pxp_reader.mli.
702 * Revision 1.1 2000/05/29 23:48:38 gerd
703 * Changed module names:
704 * Markup_aux into Pxp_aux
705 * Markup_codewriter into Pxp_codewriter
706 * Markup_document into Pxp_document
707 * Markup_dtd into Pxp_dtd
708 * Markup_entity into Pxp_entity
709 * Markup_lexer_types into Pxp_lexer_types
710 * Markup_reader into Pxp_reader
711 * Markup_types into Pxp_types
712 * Markup_yacc into Pxp_yacc
713 * See directory "compatibility" for (almost) compatible wrappers emulating
714 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
716 * ======================================================================
717 * Old logs from markup_reader.ml:
719 * Revision 1.3 2000/05/29 21:14:57 gerd
720 * Changed the type 'encoding' into a polymorphic variant.
722 * Revision 1.2 2000/05/20 20:31:40 gerd
723 * Big change: Added support for various encodings of the
724 * internal representation.
726 * Revision 1.1 2000/03/13 23:41:44 gerd
727 * Initial revision; this code was formerly part of Markup_entity.