2 * ----------------------------------------------------------------------
3 * PXP: The polymorphic XML parser for Objective Caml.
4 * Copyright by Gerd Stolpmann. See LICENSE for details.
8 exception Not_competent;;
9 exception Not_resolvable of exn;;
13 method init_rep_encoding : rep_encoding -> unit
14 method init_warner : collect_warnings -> unit
15 method rep_encoding : rep_encoding
16 method open_in : ext_id -> Lexing.lexbuf
17 method close_in : unit
18 method close_all : unit
19 method change_encoding : string -> unit
20 method clone : resolver
25 class virtual resolve_general
28 val mutable internal_encoding = `Enc_utf8
30 val mutable encoding = `Enc_utf8
31 val mutable encoding_requested = false
33 val mutable warner = new drop_warnings
35 val mutable enc_initialized = false
36 val mutable wrn_initialized = false
38 val mutable clones = []
40 method init_rep_encoding e =
41 internal_encoding <- e;
42 enc_initialized <- true;
44 method init_warner w =
46 wrn_initialized <- true;
48 method rep_encoding = (internal_encoding :> rep_encoding)
52 ( {< encoding = `Enc_utf8;
53 encoding_requested = false;
55 : # resolver :> resolver )
58 method private warn (k:int) =
59 (* Called if a character not representable has been found.
60 * k is the character code.
62 if k < 0xd800 or (k >= 0xe000 & k <= 0xfffd) or
63 (k >= 0x10000 & k <= 0x10ffff) then begin
64 warner # warn ("Code point cannot be represented: " ^ string_of_int k);
67 raise (WF_error("Code point " ^ string_of_int k ^
68 " outside the accepted range of code points"))
71 method private autodetect s =
72 (* s must be at least 4 bytes long. The slot 'encoding' is
74 * "UTF-16-BE": UTF-16/UCS-2 encoding big endian
75 * "UTF-16-LE": UTF-16/UCS-2 encoding little endian
76 * "UTF-8": UTF-8 encoding
78 if String.length s < 4 then
80 else if String.sub s 0 2 = "\254\255" then
81 encoding <- `Enc_utf16
82 (* Note: Netconversion.recode will detect the big endianess, too *)
83 else if String.sub s 0 2 = "\255\254" then
84 encoding <- `Enc_utf16
85 (* Note: Netconversion.recode will detect the little endianess, too *)
90 method private virtual next_string : string -> int -> int -> int
91 method private virtual init_in : ext_id -> unit
92 method virtual close_in : unit
95 List.iter (fun r -> r # close_in) clones
98 assert(enc_initialized && wrn_initialized);
100 encoding <- `Enc_utf8;
101 encoding_requested <- false;
102 self # init_in xid; (* may raise Not_competent *)
103 (* init_in: may already set 'encoding' *)
105 let buffer_max = 512 in
106 let buffer = String.make buffer_max ' ' in
107 let buffer_len = ref 0 in
108 let buffer_end = ref false in
110 if not !buffer_end & !buffer_len < buffer_max then begin
112 self # next_string buffer !buffer_len (buffer_max - !buffer_len) in
116 buffer_len := !buffer_len + l
121 let l = !buffer_len - n in
122 String.blit buffer n buffer 0 l;
127 if not encoding_requested then self # autodetect buffer;
131 (* TODO: if encoding = internal_encoding, it is possible to
132 * avoid copying buffer to s because s can be directly used
137 if !buffer_len = 0 then
140 let m_in = !buffer_len in
141 let m_max = if encoding_requested then n else 1 in
142 let n_in, n_out, encoding' =
143 if encoding = (internal_encoding : rep_encoding :> encoding) &&
146 (* Special case encoding = internal_encoding *)
147 String.blit buffer 0 s 0 m_in;
156 ~out_enc:(internal_encoding : rep_encoding :> encoding)
161 ~subst:(fun k -> self # warn k; "")
164 (* An incomplete character at the end of the stream: *)
165 raise Netconversion.Malformed_code;
166 (* failwith "Badly encoded character"; *)
167 encoding <- encoding';
173 method change_encoding enc =
174 if not encoding_requested then begin
175 if enc <> "" then begin
176 match Netconversion.encoding_of_string enc with
179 (`Enc_utf16_le | `Enc_utf16_be) -> ()
180 | `Enc_utf16 -> assert false
182 raise(WF_error "Encoding of data stream and encoding declaration mismatch")
187 (* else: the autodetected encoding counts *)
188 encoding_requested <- true;
194 class resolve_read_any_channel ?(close=close_in) ~channel_of_id () =
196 inherit resolve_general as super
198 val f_open = channel_of_id
199 val mutable current_channel = None
202 method private init_in (id:ext_id) =
203 if current_channel <> None then
204 failwith "Pxp_reader.resolve_read_any_channel # init_in";
205 let ch, enc_opt = f_open id in (* may raise Not_competent *)
206 begin match enc_opt with
208 | Some enc -> encoding <- enc; encoding_requested <- true
210 current_channel <- Some ch;
212 method private next_string s ofs len =
213 match current_channel with
214 None -> failwith "Pxp_reader.resolve_read_any_channel # next_string"
219 match current_channel with
223 current_channel <- None
226 let c = new resolve_read_any_channel
227 ?close:(Some close) ~channel_of_id:f_open () in
228 c # init_rep_encoding internal_encoding;
229 c # init_warner warner;
230 clones <- c :: clones;
237 class resolve_read_this_channel1 is_stale ?id ?fixenc ?close ch =
239 let getchannel = ref (fun xid -> assert false) in
242 inherit resolve_read_any_channel
244 ~channel_of_id:(fun xid -> !getchannel xid)
248 val mutable is_stale = is_stale
249 (* The channel can only be read once. To avoid that the channel
250 * is opened several times, the flag 'is_stale' is set after the
259 getchannel := self # getchannel
261 method private getchannel xid =
262 begin match fixid with
265 if xid <> bound_xid then raise Not_competent
269 method private init_in (id:ext_id) =
278 current_channel <- None
281 let c = new resolve_read_this_channel1
283 ?id:fixid ?fixenc:fixenc ?close:(Some close) fixch
285 c # init_rep_encoding internal_encoding;
286 c # init_warner warner;
287 clones <- c :: clones;
294 class resolve_read_this_channel =
295 resolve_read_this_channel1 false
299 class resolve_read_any_string ~string_of_id () =
301 inherit resolve_general as super
303 val f_open = string_of_id
304 val mutable current_string = None
305 val mutable current_pos = 0
307 method private init_in (id:ext_id) =
308 if current_string <> None then
309 failwith "Pxp_reader.resolve_read_any_string # init_in";
310 let s, enc_opt = f_open id in (* may raise Not_competent *)
311 begin match enc_opt with
313 | Some enc -> encoding <- enc; encoding_requested <- true
315 current_string <- Some s;
318 method private next_string s ofs len =
319 match current_string with
320 None -> failwith "Pxp_reader.resolve_read_any_string # next_string"
322 let l = min len (String.length str - current_pos) in
323 String.blit str current_pos s ofs l;
324 current_pos <- current_pos + l;
328 match current_string with
331 current_string <- None
334 let c = new resolve_read_any_string ~string_of_id:f_open () in
335 c # init_rep_encoding internal_encoding;
336 c # init_warner warner;
337 clones <- c :: clones;
343 class resolve_read_this_string1 is_stale ?id ?fixenc str =
345 let getstring = ref (fun xid -> assert false) in
348 inherit resolve_read_any_string (fun xid -> !getstring xid) () as super
350 val is_stale = is_stale
351 (* For some reasons, it is not allowed to open a clone of the resolver
352 * a second time when the original resolver is already open.
360 getstring := self # getstring
362 method private getstring xid =
363 begin match fixid with
366 if xid <> bound_xid then raise Not_competent
371 method private init_in (id:ext_id) =
378 let c = new resolve_read_this_string1
379 (is_stale or current_string <> None)
380 ?id:fixid ?fixenc:fixenc fixstr
382 c # init_rep_encoding internal_encoding;
383 c # init_warner warner;
384 clones <- c :: clones;
390 class resolve_read_this_string =
391 resolve_read_this_string1 false
395 class resolve_read_url_channel
396 ?(base_url = Neturl.null_url)
405 let getchannel = ref (fun xid -> assert false) in
408 inherit resolve_read_any_channel
410 ~channel_of_id:(fun xid -> !getchannel xid)
414 val base_url = base_url
415 val mutable own_url = Neturl.null_url
417 val url_of_id = url_of_id
418 val channel_of_url = channel_of_url
422 getchannel := self # getchannel
424 method private getchannel xid =
425 let rel_url = url_of_id xid in (* may raise Not_competent *)
428 (* Now compute the absolute URL: *)
430 if Neturl.url_provides ~scheme:true rel_url then
433 Neturl.apply_relative_url base_url rel_url in
434 (* may raise Malformed_URL *)
436 (* Simple check whether 'abs_url' is really absolute: *)
437 if not(Neturl.url_provides ~scheme:true abs_url)
438 then raise Not_competent;
441 (* FIXME: Copy 'abs_url' ? *)
443 (* Get and return the channel: *)
444 channel_of_url xid abs_url (* may raise Not_competent *)
446 Neturl.Malformed_URL -> raise (Not_resolvable Neturl.Malformed_URL)
447 | Not_competent -> raise (Not_resolvable Not_found)
451 new resolve_read_url_channel
452 ?base_url:(Some own_url)
455 ~channel_of_url:channel_of_url
458 c # init_rep_encoding internal_encoding;
459 c # init_warner warner;
460 clones <- c :: clones;
461 (c :> resolve_read_url_channel)
466 type spec = [ `Not_recognized | `Allowed | `Required ]
468 class resolve_as_file
469 ?(file_prefix = (`Allowed :> spec))
470 ?(host_prefix = (`Allowed :> spec))
471 ?(system_encoding = `Enc_utf8)
472 ?(map_private_id = (fun _ -> raise Not_competent))
473 ?(open_private_id = (fun _ -> raise Not_competent))
480 `Not_recognized -> Neturl.Url_part_not_recognized
481 | `Allowed -> Neturl.Url_part_allowed
482 | `Required -> Neturl.Url_part_required
484 { Neturl.null_url_syntax with
485 Neturl.url_enable_scheme = enable_if file_prefix;
486 Neturl.url_enable_host = enable_if host_prefix;
487 Neturl.url_enable_path = Neturl.Url_part_required;
488 Neturl.url_accepts_8bits = true;
492 let base_url_syntax =
493 { Neturl.null_url_syntax with
494 Neturl.url_enable_scheme = Neturl.Url_part_required;
495 Neturl.url_enable_host = Neturl.Url_part_allowed;
496 Neturl.url_enable_path = Neturl.Url_part_required;
497 Neturl.url_accepts_8bits = true;
501 let default_base_url =
505 ~path: (Neturl.split_path (Sys.getcwd() ^ "/"))
509 let file_url_of_id xid =
510 let file_url_of_sysname sysname =
511 (* By convention, we can assume that sysname is a URL conforming
512 * to RFC 1738 with the exception that it may contain non-ASCII
516 Neturl.url_of_string url_syntax sysname
517 (* may raise Malformed_URL *)
519 Neturl.Malformed_URL -> raise Not_competent
523 Anonymous -> raise Not_competent
524 | Public (_,sysname) -> if sysname <> "" then file_url_of_sysname sysname
525 else raise Not_competent
526 | System sysname -> file_url_of_sysname sysname
527 | Private pid -> map_private_id pid
530 try Neturl.url_scheme url with Not_found -> "file" in
532 try Neturl.url_host url with Not_found -> "" in
534 if scheme <> "file" then raise Not_competent;
535 if host <> "" && host <> "localhost" then raise Not_competent;
540 let channel_of_file_url xid url =
542 Private pid -> open_private_id pid
546 try Neturl.join_path (Neturl.url_path ~encoded:false url)
547 with Not_found -> raise Not_competent
551 Netconversion.recode_string
553 ~out_enc: system_encoding
555 (* May raise Malformed_code *)
557 open_in_bin path, None
558 (* May raise Sys_error *)
561 | Netconversion.Malformed_code -> assert false
562 (* should not happen *)
563 | Sys_error _ as e ->
564 raise (Not_resolvable e)
568 resolve_read_url_channel
569 ~base_url: default_base_url
570 ~url_of_id: file_url_of_id
571 ~channel_of_url: channel_of_file_url
576 let make_file_url ?(system_encoding = `Enc_utf8) ?(enc = `Enc_utf8) filename =
578 Netconversion.recode_string
584 let utf8_abs_filename =
585 if utf8_filename <> "" && utf8_filename.[0] = '/' then
588 let cwd = Sys.getcwd() in
590 Netconversion.recode_string
591 ~in_enc: system_encoding
592 ~out_enc: `Enc_utf8 in
593 cwd ^ "/" ^ utf8_filename
596 let syntax = { Neturl.ip_url_syntax with Neturl.url_accepts_8bits = true } in
597 let url = Neturl.make_url
600 ~path:(Neturl.split_path utf8_abs_filename)
607 class lookup_public_id (catalog : (string * resolver) list) =
609 List.map (fun (id,s) -> Pxp_aux.normalize_public_id id, s) catalog in
611 val cat = norm_catalog
612 val mutable internal_encoding = `Enc_utf8
613 val mutable warner = new drop_warnings
614 val mutable active_resolver = None
616 method init_rep_encoding enc =
617 internal_encoding <- enc
619 method init_warner w =
622 method rep_encoding = internal_encoding
623 (* CAUTION: This may not be the truth! *)
627 if active_resolver <> None then failwith "Pxp_reader.lookup_* # open_in";
633 (* Search pubid in catalog: *)
635 let norm_pubid = Pxp_aux.normalize_public_id pubid in
636 List.assoc norm_pubid cat
645 let r' = r # clone in
646 r' # init_rep_encoding internal_encoding;
647 r' # init_warner warner;
648 let lb = r' # open_in xid in (* may raise Not_competent *)
649 active_resolver <- Some r';
653 match active_resolver with
655 | Some r -> r # close_in;
656 active_resolver <- None
661 method change_encoding (enc:string) =
662 match active_resolver with
663 None -> failwith "Pxp_reader.lookup_* # change_encoding"
664 | Some r -> r # change_encoding enc
667 let c = new lookup_public_id cat in
668 c # init_rep_encoding internal_encoding;
669 c # init_warner warner;
675 let lookup_public_id_as_file ?(fixenc:encoding option) catalog =
676 let ch_of_id filename id =
677 let ch = open_in_bin filename in (* may raise Sys_error *)
683 (id, new resolve_read_any_channel (ch_of_id s) ())
687 new lookup_public_id catalog'
691 let lookup_public_id_as_string ?(fixenc:encoding option) catalog =
695 (id, new resolve_read_any_string (fun _ -> s, fixenc) ())
699 new lookup_public_id catalog'
703 class lookup_system_id (catalog : (string * resolver) list) =
706 val mutable internal_encoding = `Enc_utf8
707 val mutable warner = new drop_warnings
708 val mutable active_resolver = None
710 method init_rep_encoding enc =
711 internal_encoding <- enc
713 method init_warner w =
716 method rep_encoding = internal_encoding
717 (* CAUTION: This may not be the truth! *)
722 if active_resolver <> None then failwith "Pxp_reader.lookup_system_id # open_in";
734 System sysid -> lookup sysid
735 | Public(_,sysid) -> lookup sysid
736 | _ -> raise Not_competent
739 let r' = r # clone in
740 r' # init_rep_encoding internal_encoding;
741 r' # init_warner warner;
742 let lb = r' # open_in xid in (* may raise Not_competent *)
743 active_resolver <- Some r';
748 match active_resolver with
750 | Some r -> r # close_in;
751 active_resolver <- None
756 method change_encoding (enc:string) =
757 match active_resolver with
758 None -> failwith "Pxp_reader.lookup_system # change_encoding"
759 | Some r -> r # change_encoding enc
762 let c = new lookup_system_id cat in
763 c # init_rep_encoding internal_encoding;
764 c # init_warner warner;
770 let lookup_system_id_as_file ?(fixenc:encoding option) catalog =
771 let ch_of_id filename id =
772 let ch = open_in_bin filename in (* may raise Sys_error *)
778 (id, new resolve_read_any_channel (ch_of_id s) ())
782 new lookup_system_id catalog'
786 let lookup_system_id_as_string ?(fixenc:encoding option) catalog =
790 (id, new resolve_read_any_string (fun _ -> s, fixenc) ())
794 new lookup_system_id catalog'
798 type combination_mode =
800 | System_before_public
804 class combine ?prefer ?(mode = Public_before_system) rl =
806 val prefered_resolver = prefer
808 val resolvers = (rl : resolver list)
809 val mutable internal_encoding = `Enc_utf8
810 val mutable warner = new drop_warnings
811 val mutable active_resolver = None
812 val mutable clones = []
814 method init_rep_encoding enc =
816 (fun r -> r # init_rep_encoding enc)
818 internal_encoding <- enc
820 method init_warner w =
822 (fun r -> r # init_warner w)
826 method rep_encoding = internal_encoding
827 (* CAUTION: This may not be the truth! *)
830 let rec find_competent_resolver_for xid' rl =
834 r, (r # open_in xid')
836 Not_competent -> find_competent_resolver_for xid' rl'
842 let find_competent_resolver rl =
844 Public(pubid,sysid) ->
846 Public_before_system ->
848 find_competent_resolver_for(Public(pubid,"")) rl
851 find_competent_resolver_for(System sysid) rl
853 | System_before_public ->
855 find_competent_resolver_for(System sysid) rl
858 find_competent_resolver_for(Public(pubid,"")) rl
862 find_competent_resolver_for other rl
865 if active_resolver <> None then failwith "Pxp_reader.combine # open_in";
867 match prefered_resolver with
868 None -> find_competent_resolver resolvers
869 | Some r -> find_competent_resolver (r :: resolvers)
871 active_resolver <- Some r;
875 match active_resolver with
877 | Some r -> r # close_in;
878 active_resolver <- None
881 List.iter (fun r -> r # close_in) clones
883 method change_encoding (enc:string) =
884 match active_resolver with
885 None -> failwith "Pxp_reader.combine # change_encoding"
886 | Some r -> r # change_encoding enc
890 match active_resolver with
892 new combine ?prefer:None ?mode:(Some mode)
893 (List.map (fun q -> q # clone) resolvers)
895 let r' = r # clone in
900 (fun q -> if q == r then r' else q # clone)
903 c # init_rep_encoding internal_encoding;
904 c # init_warner warner;
905 clones <- c :: clones;
911 (* ======================================================================
915 * Revision 1.2 2002/01/29 14:44:29 sacerdot
916 * Ported to ocaml-3.04.
918 * Revision 1.1 2001/11/26 18:28:28 sacerdot
919 * HELM OCaml libraries with findlib support.
921 * Revision 1.1 2001/10/24 15:33:16 sacerdot
922 * New procedure to create metadata committed and old procedure removed.
923 * The new procedure is based on ocaml code and builds metadata for both
924 * forward and backward pointers. The old one was based on a stylesheet.
926 * Revision 1.16 2001/07/01 09:46:40 gerd
927 * Fix: resolve_read_url_channel does not use the base_url if
928 * the current URL is already absolute
930 * Revision 1.15 2001/07/01 08:35:23 gerd
931 * Instead of the ~auto_close argument, there is now a
932 * ~close argument for several functions/classes. This allows some
933 * additional action when the resolver is closed.
935 * Revision 1.14 2001/06/14 23:28:02 gerd
936 * Fix: class combine works now with private IDs.
938 * Revision 1.13 2001/04/22 14:16:48 gerd
939 * resolve_as_file: you can map private IDs to arbitrary channels.
940 * resolve_read_url_channel: changed type of the channel_of_url
941 * argument (ext_id is also passed)
942 * More examples and documentation.
944 * Revision 1.12 2001/04/21 17:40:48 gerd
945 * Bugfix in 'combine'
947 * Revision 1.11 2001/04/03 20:22:44 gerd
948 * New resolvers for catalogs of PUBLIC and SYSTEM IDs.
949 * Improved "combine": PUBLIC and SYSTEM IDs are handled
951 * Rewritten from_file: Is now a simple application of the
952 * Pxp_reader classes and functions. (The same has still to be done
955 * Revision 1.10 2001/02/01 20:38:49 gerd
956 * New support for PUBLIC identifiers.
958 * Revision 1.9 2000/08/14 22:24:55 gerd
959 * Moved the module Pxp_encoding to the netstring package under
960 * the new name Netconversion.
962 * Revision 1.8 2000/07/16 18:31:09 gerd
963 * The exception Illegal_character has been dropped.
965 * Revision 1.7 2000/07/09 15:32:01 gerd
966 * Fix in resolve_this_channel, resolve_this_string
968 * Revision 1.6 2000/07/09 01:05:33 gerd
969 * New methode 'close_all' that closes the clones, too.
971 * Revision 1.5 2000/07/08 16:24:56 gerd
972 * Introduced the exception 'Not_resolvable' to indicate that
973 * 'combine' should not try the next resolver of the list.
975 * Revision 1.4 2000/07/06 23:04:46 gerd
976 * Quick fix for 'combine': The active resolver is "prefered",
977 * but the other resolvers are also used.
979 * Revision 1.3 2000/07/06 21:43:45 gerd
980 * Fix: Public(_,name) is now treated as System(name) if
983 * Revision 1.2 2000/07/04 22:13:30 gerd
984 * Implemented the new API rev. 1.2 of pxp_reader.mli.
986 * Revision 1.1 2000/05/29 23:48:38 gerd
987 * Changed module names:
988 * Markup_aux into Pxp_aux
989 * Markup_codewriter into Pxp_codewriter
990 * Markup_document into Pxp_document
991 * Markup_dtd into Pxp_dtd
992 * Markup_entity into Pxp_entity
993 * Markup_lexer_types into Pxp_lexer_types
994 * Markup_reader into Pxp_reader
995 * Markup_types into Pxp_types
996 * Markup_yacc into Pxp_yacc
997 * See directory "compatibility" for (almost) compatible wrappers emulating
998 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
1000 * ======================================================================
1001 * Old logs from markup_reader.ml:
1003 * Revision 1.3 2000/05/29 21:14:57 gerd
1004 * Changed the type 'encoding' into a polymorphic variant.
1006 * Revision 1.2 2000/05/20 20:31:40 gerd
1007 * Big change: Added support for various encodings of the
1008 * internal representation.
1010 * Revision 1.1 2000/03/13 23:41:44 gerd
1011 * Initial revision; this code was formerly part of Markup_entity.