--- /dev/null
+(* $Id$
+ * ----------------------------------------------------------------------
+ *)
+
+exception Malformed_code
+
+
+type encoding =
+ [ `Enc_utf8 (* UTF-8 *)
+ | `Enc_java
+ | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *)
+ | `Enc_utf16_le (* UTF-16 little endian *)
+ | `Enc_utf16_be (* UTF-16 big endian *)
+ | `Enc_usascii (* US-ASCII (only 7 bit) *)
+ | `Enc_iso88591 (* ISO-8859-1 *)
+ | `Enc_iso88592 (* ISO-8859-2 *)
+ | `Enc_iso88593 (* ISO-8859-3 *)
+ | `Enc_iso88594 (* ISO-8859-4 *)
+ | `Enc_iso88595 (* ISO-8859-5 *)
+ | `Enc_iso88596 (* ISO-8859-6 *)
+ | `Enc_iso88597 (* ISO-8859-7 *)
+ | `Enc_iso88598 (* ISO-8859-8 *)
+ | `Enc_iso88599 (* ISO-8859-9 *)
+ | `Enc_iso885910 (* ISO-8859-10 *)
+ | `Enc_iso885913 (* ISO-8859-13 *)
+ | `Enc_iso885914 (* ISO-8859-14 *)
+ | `Enc_iso885915 (* ISO-8859-15 *)
+ | `Enc_koi8r (* KOI8-R *)
+ | `Enc_jis0201 (* JIS-0201 *)
+ (* Microsoft: *)
+ | `Enc_windows1250 (* WINDOWS-1250 *)
+ | `Enc_windows1251 (* WINDOWS-1251 *)
+ | `Enc_windows1252 (* WINDOWS-1252 *)
+ | `Enc_windows1253 (* WINDOWS-1253 *)
+ | `Enc_windows1254 (* WINDOWS-1254 *)
+ | `Enc_windows1255 (* WINDOWS-1255 *)
+ | `Enc_windows1256 (* WINDOWS-1256 *)
+ | `Enc_windows1257 (* WINDOWS-1257 *)
+ | `Enc_windows1258 (* WINDOWS-1258 *)
+ (* IBM, ASCII-based: *)
+ | `Enc_cp437
+ | `Enc_cp737
+ | `Enc_cp775
+ | `Enc_cp850
+ | `Enc_cp852
+ | `Enc_cp855
+ | `Enc_cp856
+ | `Enc_cp857
+ | `Enc_cp860
+ | `Enc_cp861
+ | `Enc_cp862
+ | `Enc_cp863
+ | `Enc_cp864
+ | `Enc_cp865
+ | `Enc_cp866
+ | `Enc_cp869
+ | `Enc_cp874
+ | `Enc_cp1006
+ (* IBM, EBCDIC-based: *)
+ | `Enc_cp037
+ | `Enc_cp424
+ | `Enc_cp500
+ | `Enc_cp875
+ | `Enc_cp1026
+ (* Adobe: *)
+ | `Enc_adobe_standard_encoding
+ | `Enc_adobe_symbol_encoding
+ | `Enc_adobe_zapf_dingbats_encoding
+ (* Apple: *)
+ | `Enc_macroman
+
+ ]
+;;
+
+
+let norm_enc_name e =
+ (* Removes some characters from e; uppercase *)
+ let e' = String.create (String.length e) in
+ let rec next i j =
+ if i < String.length e then
+ match e.[i] with
+ ('-'|'_'|'.') -> next (i+1) j
+ | c -> e'.[j] <- c; next (i+1) (j+1)
+ else
+ j
+ in
+ let l = next 0 0 in
+ String.uppercase(String.sub e' 0 l)
+;;
+
+
+let encoding_of_string e =
+ match norm_enc_name e with
+ ("UTF16"|"UCS2"|"ISO10646UCS2") -> `Enc_utf16
+ | "UTF16BE" -> `Enc_utf16_be
+ | "UTF16LE" -> `Enc_utf16_le
+ | "UTF8" -> `Enc_utf8
+ | ("UTF8JAVA"|"JAVA") -> `Enc_java
+ | ("USASCII"|"ASCII"|"ISO646US"|"IBM367"|"CP367") -> `Enc_usascii
+ | ("ISO88591"|"LATIN1"|"IBM819"|"CP819") -> `Enc_iso88591
+ | ("ISO88592"|"LATIN2") -> `Enc_iso88592
+ | ("ISO88593"|"LATIN3") -> `Enc_iso88593
+ | ("ISO88594"|"LATIN4") -> `Enc_iso88594
+ | ("ISO88595"|"CYRILLIC") -> `Enc_iso88595
+ | ("ISO88596"|"ARABIC"|"ECMA114"|"ASMO708") -> `Enc_iso88596
+ | ("ISO88597"|"GREEK"|"GREEK8"|"ELOT928"|"ECMA118") -> `Enc_iso88597
+ | ("ISO88598"|"HEBREW") -> `Enc_iso88598
+ | ("ISO88599"|"LATIN5") -> `Enc_iso88599
+ | ("ISO885910"|"LATIN6") -> `Enc_iso885910
+ | "ISO885913" -> `Enc_iso885913
+ | "ISO885914" -> `Enc_iso885914
+ | "ISO885915" -> `Enc_iso885915
+ | "KOI8R" -> `Enc_koi8r
+ | ("JIS0201"|"JISX0201"|"X0201") -> `Enc_jis0201
+
+ | "WINDOWS1250" -> `Enc_windows1250
+ | "WINDOWS1251" -> `Enc_windows1251
+ | "WINDOWS1252" -> `Enc_windows1252
+ | "WINDOWS1253" -> `Enc_windows1253
+ | "WINDOWS1254" -> `Enc_windows1254
+ | "WINDOWS1255" -> `Enc_windows1255
+ | "WINDOWS1256" -> `Enc_windows1256
+ | "WINDOWS1257" -> `Enc_windows1257
+ | "WINDOWS1258" -> `Enc_windows1258
+
+ | ("CP437"|"IBM437") -> `Enc_cp437
+ | ("CP737"|"IBM737") -> `Enc_cp737
+ | ("CP775"|"IBM775") -> `Enc_cp775
+ | ("CP850"|"IBM850") -> `Enc_cp850
+ | ("CP852"|"IBM852") -> `Enc_cp852
+ | ("CP855"|"IBM855") -> `Enc_cp855
+ | ("CP856"|"IBM856") -> `Enc_cp856
+ | ("CP857"|"IBM857") -> `Enc_cp857
+ | ("CP860"|"IBM860") -> `Enc_cp860
+ | ("CP861"|"IBM861") -> `Enc_cp861
+ | ("CP862"|"IBM862") -> `Enc_cp862
+ | ("CP863"|"IBM863") -> `Enc_cp863
+ | ("CP864"|"IBM864") -> `Enc_cp864
+ | ("CP865"|"IBM865") -> `Enc_cp865
+ | ("CP866"|"IBM866") -> `Enc_cp866
+ | ("CP869"|"IBM869") -> `Enc_cp869
+ | ("CP874"|"IBM874") -> `Enc_cp874
+ | ("CP1006"|"IBM1006") -> `Enc_cp1006
+
+ | ("CP037"|"IBM037"|"EBCDICCPUS"|"EBCDICCPCA"|"EBCDICCPWT"|
+ "EBCDICCPNL") -> `Enc_cp037
+ | ("CP424"|"IBM424"|"EBCDICCPHE") -> `Enc_cp424
+ | ("CP500"|"IBM500"|"EBCDICCPBE"|"EBCDICCPCH") -> `Enc_cp500
+ | ("CP875"|"IBM875") -> `Enc_cp875
+ | ("CP1026"|"IBM1026") -> `Enc_cp1026
+
+ | "ADOBESTANDARDENCODING" -> `Enc_adobe_standard_encoding
+ | "ADOBESYMBOLENCODING" -> `Enc_adobe_symbol_encoding
+ | "ADOBEZAPFDINGBATSENCODING" -> `Enc_adobe_zapf_dingbats_encoding
+
+ | "MACINTOSH" -> `Enc_macroman
+
+ | _ ->
+ failwith "Netconversion.encoding_of_string: unknown encoding"
+;;
+
+
+let string_of_encoding (e : encoding) =
+ (* If there is a "preferred MIME name", this name is returned (see IANA). *)
+ match e with
+ `Enc_utf16 -> "UTF-16"
+ | `Enc_utf16_be -> "UTF-16-BE"
+ | `Enc_utf16_le -> "UTF-16-LE"
+ | `Enc_utf8 -> "UTF-8"
+ | `Enc_java -> "UTF-8-JAVA"
+ | `Enc_usascii -> "US-ASCII"
+ | `Enc_iso88591 -> "ISO-8859-1"
+ | `Enc_iso88592 -> "ISO-8859-2"
+ | `Enc_iso88593 -> "ISO-8859-3"
+ | `Enc_iso88594 -> "ISO-8859-4"
+ | `Enc_iso88595 -> "ISO-8859-5"
+ | `Enc_iso88596 -> "ISO-8859-6"
+ | `Enc_iso88597 -> "ISO-8859-7"
+ | `Enc_iso88598 -> "ISO-8859-8"
+ | `Enc_iso88599 -> "ISO-8859-9"
+ | `Enc_iso885910 -> "ISO-8859-10"
+ | `Enc_iso885913 -> "ISO-8859-13"
+ | `Enc_iso885914 -> "ISO-8859-14"
+ | `Enc_iso885915 -> "ISO-8859-15"
+ | `Enc_koi8r -> "KOI8-R"
+ | `Enc_jis0201 -> "JIS_X0201"
+ | `Enc_windows1250 -> "WINDOWS-1250"
+ | `Enc_windows1251 -> "WINDOWS-1251"
+ | `Enc_windows1252 -> "WINDOWS-1252"
+ | `Enc_windows1253 -> "WINDOWS-1253"
+ | `Enc_windows1254 -> "WINDOWS-1254"
+ | `Enc_windows1255 -> "WINDOWS-1255"
+ | `Enc_windows1256 -> "WINDOWS-1256"
+ | `Enc_windows1257 -> "WINDOWS-1257"
+ | `Enc_windows1258 -> "WINDOWS-1258"
+ | `Enc_cp437 -> "CP437"
+ | `Enc_cp737 -> "CP737"
+ | `Enc_cp775 -> "CP775"
+ | `Enc_cp850 -> "CP850"
+ | `Enc_cp852 -> "CP852"
+ | `Enc_cp855 -> "CP855"
+ | `Enc_cp856 -> "CP856"
+ | `Enc_cp857 -> "CP857"
+ | `Enc_cp860 -> "CP860"
+ | `Enc_cp861 -> "CP861"
+ | `Enc_cp862 -> "CP862"
+ | `Enc_cp863 -> "CP863"
+ | `Enc_cp864 -> "CP864"
+ | `Enc_cp865 -> "CP865"
+ | `Enc_cp866 -> "CP866"
+ | `Enc_cp869 -> "CP869"
+ | `Enc_cp874 -> "CP874"
+ | `Enc_cp1006 -> "CP1006"
+ | `Enc_cp037 -> "CP037"
+ | `Enc_cp424 -> "CP424"
+ | `Enc_cp500 -> "CP500"
+ | `Enc_cp875 -> "CP875"
+ | `Enc_cp1026 -> "CP1026"
+ | `Enc_adobe_standard_encoding -> "ADOBE-STANDARD-ENCODING"
+ | `Enc_adobe_symbol_encoding -> "ADOBE-SYMBOL-ENCODING"
+ | `Enc_adobe_zapf_dingbats_encoding -> "ADOBE-ZAPF-DINGBATS-ENCODING"
+ | `Enc_macroman -> "MACINTOSH"
+;;
+
+
+let read_iso88591 write s_in p_in l_in =
+ let rec scan k_in k_out c_out =
+ if k_in < l_in then begin
+ let p = Char.code s_in.[p_in + k_in] in
+ let n = write p k_out c_out in
+ if n < 0 then
+ k_in, k_out, `Enc_iso88591
+ else
+ scan (k_in + 1) (k_out + n) (c_out + 1)
+ end
+ else
+ k_in, k_out, `Enc_iso88591
+ in
+ scan 0 0 0
+;;
+
+
+let read_usascii write s_in p_in l_in =
+ let rec scan k_in k_out c_out =
+ if k_in < l_in then begin
+ let p = Char.code s_in.[p_in + k_in] in
+ if p >= 0x80 then raise Malformed_code;
+ let n = write p k_out c_out in
+ if n < 0 then
+ k_in, k_out, `Enc_usascii
+ else
+ scan (k_in + 1) (k_out + n) (c_out + 1)
+ end
+ else
+ k_in, k_out, `Enc_usascii
+ in
+ scan 0 0 0
+;;
+
+
+let read_8bit m_to_unicode enc write s_in p_in l_in =
+ let rec scan k_in k_out c_out =
+ if k_in < l_in then begin
+ let p_local = Char.code s_in.[p_in + k_in] in
+ let p_uni = Array.unsafe_get m_to_unicode p_local in
+ if p_uni < 0 then raise Malformed_code;
+ let n = write p_uni k_out c_out in
+ if n < 0 then
+ k_in, k_out, enc
+ else
+ scan (k_in + 1) (k_out + n) (c_out + 1)
+ end
+ else
+ k_in, k_out, enc
+ in
+ scan 0 0 0
+;;
+
+
+let read_utf8 is_java write s_in p_in l_in =
+ let rec scan k_in k_out c_out =
+ if k_in < l_in then begin
+ let n_out, n_in =
+ match s_in.[p_in + k_in] with
+ '\000' ->
+ if is_java then raise Malformed_code;
+ write 0 k_out c_out, 1
+ | ('\001'..'\127' as c) ->
+ write (Char.code c) k_out c_out, 1
+ | ('\128'..'\223' as c) ->
+ if k_in + 1 >= l_in then
+ -1, 0
+ else begin
+ let n1 = Char.code c in
+ let n2 = Char.code (s_in.[p_in + k_in + 1]) in
+ if is_java && (n1 = 0x80 && n2 = 0xc0) then
+ write 0 k_out c_out, 2
+ else begin
+ if n2 < 128 or n2 > 191 then raise Malformed_code;
+ let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in
+ if p < 128 then raise Malformed_code;
+ write p k_out c_out, 2
+ end
+ end
+ | ('\224'..'\239' as c) ->
+ if k_in + 2 >= l_in then
+ -1, 0
+ else begin
+ let n1 = Char.code c in
+ let n2 = Char.code (s_in.[p_in + k_in + 1]) in
+ let n3 = Char.code (s_in.[p_in + k_in + 2]) in
+ if n2 < 128 or n2 > 191 then raise Malformed_code;
+ if n3 < 128 or n3 > 191 then raise Malformed_code;
+ let p =
+ ((n1 land 0b1111) lsl 12) lor
+ ((n2 land 0b111111) lsl 6) lor
+ (n3 land 0b111111)
+ in
+ if p < 0x800 then raise Malformed_code;
+ if (p >= 0xd800 && p < 0xe000) then
+ (* Surrogate pairs are not supported in UTF-8 *)
+ raise Malformed_code;
+ if (p >= 0xfffe && p <= 0xffff) then
+ raise Malformed_code;
+ write p k_out c_out, 3
+ end
+ | ('\240'..'\247' as c) ->
+ if k_in + 3 >= l_in then
+ -1, 0
+ else begin
+ let n1 = Char.code c in
+ let n2 = Char.code (s_in.[p_in + k_in + 1]) in
+ let n3 = Char.code (s_in.[p_in + k_in + 2]) in
+ let n4 = Char.code (s_in.[p_in + k_in + 3]) in
+ if n2 < 128 or n2 > 191 then raise Malformed_code;
+ if n3 < 128 or n3 > 191 then raise Malformed_code;
+ if n4 < 128 or n4 > 191 then raise Malformed_code;
+ let p = ((n1 land 0b111) lsl 18) lor
+ ((n2 land 0b111111) lsl 12) lor
+ ((n3 land 0b111111) lsl 6) lor
+ (n4 land 0b111111)
+ in
+ if p < 0x10000 then raise Malformed_code;
+ if p >= 0x110000 then
+ (* These code points are not supported. *)
+ raise Malformed_code;
+ write p k_out c_out, 4
+ end
+ | _ ->
+ (* Outside the valid range of XML characters *)
+ raise Malformed_code;
+ in
+ (* n_out: number of written bytes; -1 means out buf is full
+ * n_in: number of read bytes; 0 means end of in buf reached
+ * n_in = 0 implies n_out = -1
+ *)
+ if n_out < 0 then
+ k_in, k_out, `Enc_utf8
+ else
+ scan (k_in + n_in) (k_out + n_out) (c_out + 1)
+ end
+ else
+ k_in, k_out, `Enc_utf8
+ in
+ scan 0 0 0
+;;
+
+
+let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00;;
+
+let read_utf16_le k_in_0 write s_in p_in l_in =
+ let rec scan k_in k_out c_out =
+ if k_in + 1 < l_in then begin
+ let p = (Char.code s_in.[p_in + k_in]) lor ((Char.code s_in.[p_in + k_in + 1]) lsl 8) in
+
+ if p >= 0xd800 & p < 0xe000 then begin
+ (* This is a surrogate pair. *)
+ if k_in + 3 < l_in then begin
+ if p <= 0xdbff then begin
+ let q = (Char.code s_in.[p_in + k_in + 2 ]) lor
+ ((Char.code s_in.[p_in + k_in + 3]) lsl 8) in
+ if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
+ let eff_p = (p lsl 10) + q + surrogate_offset in
+ let n = write eff_p k_out c_out in
+ if n < 0 then
+ k_in, k_out, `Enc_utf16_le
+ else
+ scan (k_in + 4) (k_out + n) (c_out + 1)
+ end
+ else
+ (* Malformed pair: *)
+ raise Malformed_code;
+ end
+ else
+ (* Incomplete pair: *)
+ k_in, k_out, `Enc_utf16_le
+ end
+
+ else
+ if p = 0xfffe then
+ (* Big endian byte order mark: It is illegal here *)
+ raise Malformed_code
+ else begin
+ (* A regular code point *)
+ let n = write p k_out c_out in
+ if n < 0 then
+ k_in, k_out, `Enc_utf16_le
+ else
+ scan (k_in + 2) (k_out + n) (c_out + 1)
+ end
+ end
+ else
+ (* Incomplete character: *)
+ k_in, k_out, `Enc_utf16_le
+ in
+ scan k_in_0 0 0
+;;
+
+
+let read_utf16_be k_in_0 write s_in p_in l_in =
+ let rec scan k_in k_out c_out =
+ if k_in + 1 < l_in then begin
+ let p = (Char.code s_in.[p_in + k_in + 1]) lor ((Char.code s_in.[p_in + k_in]) lsl 8) in
+
+ if p >= 0xd800 & p < 0xe000 then begin
+ (* This is a surrogate pair. *)
+ if k_in + 3 < l_in then begin
+ if p <= 0xdbff then begin
+ let q = (Char.code s_in.[p_in + k_in + 3 ]) lor
+ ((Char.code s_in.[p_in + k_in + 2]) lsl 8) in
+ if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
+ let eff_p = (p lsl 10) + q + surrogate_offset in
+ let n = write eff_p k_out c_out in
+ if n < 0 then
+ k_in, k_out, `Enc_utf16_be
+ else
+ scan (k_in + 4) (k_out + n) (c_out + 1)
+ end
+ else
+ (* Malformed pair: *)
+ raise Malformed_code;
+ end
+ else
+ (* Incomplete pair: *)
+ k_in, k_out, `Enc_utf16_be
+ end
+
+ else
+ if p = 0xfffe then
+ (* Little endian byte order mark: It is illegal here *)
+ raise Malformed_code
+ else begin
+ (* A regular code point *)
+ let n = write p k_out c_out in
+ if n < 0 then
+ k_in, k_out, `Enc_utf16_be
+ else
+ scan (k_in + 2) (k_out + n) (c_out + 1)
+ end
+
+ end
+ else
+ (* Incomplete character: *)
+ k_in, k_out, `Enc_utf16_be
+ in
+ scan k_in_0 0 0
+;;
+
+
+let read_utf16 write s_in p_in l_in =
+ (* Expect a BOM at the beginning of the text *)
+ if l_in >= 2 then begin
+ let c0 = s_in.[p_in + 0] in
+ let c1 = s_in.[p_in + 1] in
+ if c0 = '\254' & c1 = '\255' then begin
+ (* 0xfeff as big endian *)
+ read_utf16_be 2 write s_in p_in l_in
+ end
+ else
+ if c0 = '\255' & c1 = '\254' then begin
+ (* 0xfeff as little endian *)
+ read_utf16_le 2 write s_in p_in l_in
+ end
+ else
+ (* byte order mark missing *)
+ raise Malformed_code
+ end
+ else
+ 0, 0, `Enc_utf16
+;;
+
+
+let write_iso88591 s_out p_out l_out max_chars w p k_out c_out =
+ if k_out < l_out && c_out < max_chars then begin
+ if p > 255 then begin
+ let subst = w p in
+ let l_subst = String.length subst in
+ if k_out + l_subst <= l_out then begin
+ (* Enough space to store 'subst': *)
+ String.blit subst 0 s_out (k_out+p_out) l_subst;
+ l_subst
+ end
+ else
+ (* Not enough space: Stop this round of recoding *)
+ -1
+ end
+ else begin
+ s_out.[p_out + k_out] <- Char.chr p;
+ 1
+ end
+ end
+ else
+ -1 (* End-of-buffer indicator *)
+;;
+
+
+let write_usascii s_out p_out l_out max_chars w p k_out c_out =
+ if k_out < l_out && c_out < max_chars then begin
+ if p > 127 then begin
+ let subst = w p in
+ let l_subst = String.length subst in
+ if k_out + l_subst <= l_out then begin
+ (* Enough space to store 'subst': *)
+ String.blit subst 0 s_out (k_out+p_out) l_subst;
+ l_subst
+ end
+ else
+ (* Not enough space: Stop this round of recoding *)
+ -1
+ end
+ else begin
+ s_out.[p_out + k_out] <- Char.chr p;
+ 1
+ end
+ end
+ else
+ -1 (* End-of-buffer indicator *)
+;;
+
+
+let write_8bit from_unicode s_out p_out l_out max_chars w p k_out c_out =
+ if k_out < l_out && c_out < max_chars then begin
+ let p' =
+ match Array.unsafe_get from_unicode (p land 255) with
+ Netmappings.U_nil -> -1
+ | Netmappings.U_single (p0,q0) ->
+ if p0 = p then q0 else -1
+ | Netmappings.U_list l ->
+ (try List.assoc p l with Not_found -> -1)
+ in
+ if p' < 0 then begin
+ let subst = w p in
+ let l_subst = String.length subst in
+ if k_out + l_subst <= l_out then begin
+ (* Enough space to store 'subst': *)
+ String.blit subst 0 s_out (k_out+p_out) l_subst;
+ l_subst
+ end
+ else
+ (* Not enough space: Stop this round of recoding *)
+ -1
+ end
+ else begin
+ s_out.[p_out + k_out] <- Char.chr p';
+ 1
+ end
+ end
+ else
+ -1 (* End-of-buffer indicator *)
+;;
+
+
+let write_utf8 is_java s_out p_out l_out max_chars w p k_out c_out =
+ if p <= 127 && (not is_java || p <> 0) then begin
+ if k_out < l_out && c_out < max_chars then begin
+ s_out.[p_out + k_out] <- Char.chr p;
+ 1
+ end
+ else -1
+ end
+ else if p <= 0x7ff then begin
+ if k_out + 1 < l_out && c_out < max_chars then begin
+ s_out.[p_out + k_out] <- Char.chr (0xc0 lor (p lsr 6));
+ s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor (p land 0x3f));
+ 2
+ end
+ else -1
+ end
+ else if p <= 0xffff then begin
+ (* Refuse writing surrogate pairs, and fffe, ffff *)
+ if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then
+ failwith "Netconversion.write_utf8";
+ if k_out + 2 < l_out && c_out < max_chars then begin
+ s_out.[p_out + k_out] <- Char.chr (0xe0 lor (p lsr 12));
+ s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f));
+ s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor (p land 0x3f));
+ 3
+ end
+ else -1
+ end
+ else if p <= 0x10ffff then begin
+ if k_out + 3 < l_out && c_out < max_chars then begin
+ s_out.[p_out + k_out] <- Char.chr (0xf0 lor (p lsr 18));
+ s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 12) land 0x3f));
+ s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f));
+ s_out.[p_out + k_out + 3] <- Char.chr (0x80 lor (p land 0x3f));
+ 4
+ end
+ else -1
+ end
+ else
+ (* Higher code points are not possible in XML: *)
+ failwith "Netconversion.write_utf8"
+;;
+
+
+let write_utf16_le s_out p_out l_out max_chars w p k_out c_out =
+ if p >= 0xfffe then begin
+ if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_le";
+ (* Must be written as surrogate pair *)
+ if k_out + 3 < l_out && c_out < max_chars then begin
+ let high = (p lsr 10) + 0xd800 in
+ let low = (p land 0x3ff) + 0xdc00 in
+ s_out.[p_out + k_out ] <- Char.chr (high land 0xff);
+ s_out.[p_out + k_out + 1] <- Char.chr (high lsr 8);
+ s_out.[p_out + k_out + 2] <- Char.chr (low land 0xff);
+ s_out.[p_out + k_out + 3] <- Char.chr (low lsr 8);
+ 4
+ end
+ else -1
+ end
+ else begin
+ if k_out + 1 < l_out && c_out < max_chars then begin
+ s_out.[p_out + k_out ] <- Char.chr (p land 0xff);
+ s_out.[p_out + k_out + 1] <- Char.chr (p lsr 8);
+ 2
+ end
+ else
+ -1
+ end
+;;
+
+
+let write_utf16_be s_out p_out l_out max_chars w p k_out c_out =
+ if p >= 0xfffe then begin
+ if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_be";
+ (* Must be written as surrogate pair *)
+ if k_out + 3 < l_out && c_out < max_chars then begin
+ let high = (p lsr 10) + 0xd800 in
+ let low = (p land 0x3ff) + 0xdc00 in
+ s_out.[p_out + k_out + 1] <- Char.chr (high land 0xff);
+ s_out.[p_out + k_out ] <- Char.chr (high lsr 8);
+ s_out.[p_out + k_out + 3] <- Char.chr (low land 0xff);
+ s_out.[p_out + k_out + 2] <- Char.chr (low lsr 8);
+ 4
+ end
+ else -1
+ end
+ else begin
+ if k_out + 1 < l_out && c_out < max_chars then begin
+ s_out.[p_out + k_out + 1] <- Char.chr (p land 0xff);
+ s_out.[p_out + k_out ] <- Char.chr (p lsr 8);
+ 2
+ end
+ else
+ -1
+ end
+;;
+
+
+let recode ~in_enc
+ ~in_buf
+ ~in_pos
+ ~in_len
+ ~out_enc
+ ~out_buf
+ ~out_pos
+ ~out_len
+ ~max_chars
+ ~subst =
+ if (in_pos < 0 || in_len < 0 || in_pos + in_len > String.length in_buf ||
+ out_pos < 0 || out_len < 0 || out_pos + out_len > String.length out_buf)
+ then
+ invalid_arg "Netconversion.recode";
+
+ let reader =
+ match in_enc with
+ `Enc_iso88591 -> read_iso88591
+ | `Enc_usascii -> read_usascii
+ | `Enc_utf8 -> read_utf8 false
+ | `Enc_java -> read_utf8 true
+ | `Enc_utf16 -> read_utf16
+ | `Enc_utf16_le -> read_utf16_le 0
+ | `Enc_utf16_be -> read_utf16_be 0
+ | _ ->
+ (try
+ let to_unicode' = Hashtbl.find Netmappings.to_unicode in_enc in
+ let to_unicode =
+ Netmappings.lock();
+ Lazy.force to_unicode' in
+ Netmappings.unlock();
+ read_8bit to_unicode in_enc
+ with
+ Not_found ->
+ failwith("Support for the encoding `" ^
+ string_of_encoding in_enc ^
+ "' has not been compiled into Netstring")
+ )
+ in
+ let writer =
+ match out_enc with
+ `Enc_iso88591 -> write_iso88591 out_buf out_pos out_len max_chars subst
+ | `Enc_usascii -> write_usascii out_buf out_pos out_len max_chars subst
+ | `Enc_utf8 -> write_utf8 false
+ out_buf out_pos out_len max_chars subst
+ | `Enc_java -> write_utf8 true out_buf out_pos out_len max_chars subst
+ | `Enc_utf16 -> failwith "Netconversion.recode"
+ | `Enc_utf16_le -> write_utf16_le out_buf out_pos out_len max_chars subst
+ | `Enc_utf16_be -> write_utf16_be out_buf out_pos out_len max_chars subst
+ | _ ->
+ (try
+ let from_unicode' = Hashtbl.find Netmappings.from_unicode out_enc
+ in
+ let from_unicode =
+ Netmappings.lock();
+ Lazy.force from_unicode' in
+ Netmappings.unlock();
+ write_8bit from_unicode out_buf out_pos out_len max_chars subst
+ with
+ Not_found ->
+ failwith("Support for the encoding `" ^
+ string_of_encoding out_enc ^
+ "' has not been compiled into Netstring")
+ )
+ in
+ reader writer in_buf in_pos in_len
+;;
+
+
+let makechar enc p =
+ match enc with
+ `Enc_iso88591 ->
+ if p > 255 then raise Not_found;
+ String.make 1 (Char.chr p)
+ | `Enc_usascii ->
+ if p > 127 then raise Not_found;
+ String.make 1 (Char.chr p)
+ | `Enc_utf8 ->
+ let s = String.create 4 in
+ let n = write_utf8 false s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
+ String.sub s 0 n
+ | `Enc_java ->
+ let s = String.create 4 in
+ let n = write_utf8 true s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
+ String.sub s 0 n
+ | `Enc_utf16_le ->
+ let s = String.create 4 in
+ let n = write_utf16_le s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
+ String.sub s 0 n
+ | `Enc_utf16_be ->
+ let s = String.create 4 in
+ let n = write_utf16_be s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
+ String.sub s 0 n
+ | `Enc_utf16 ->
+ failwith "Netconversion.makechar"
+ | _ ->
+ let s = String.create 1 in
+ let from_unicode' =
+ try
+ Hashtbl.find Netmappings.from_unicode enc
+ with
+ Not_found ->
+ failwith("Support for the encoding `" ^
+ string_of_encoding enc ^
+ "' has not been compiled into Netstring")
+ in
+ let from_unicode =
+ Netmappings.lock();
+ Lazy.force from_unicode' in
+ Netmappings.unlock();
+ let n =
+ write_8bit from_unicode s 0 1 1 (fun _ -> raise Not_found) p 0 0 in
+ s
+;;
+
+
+let recode_string ~in_enc ~out_enc ?(subst = (fun _ -> raise Not_found)) s =
+
+ let length = String.length s in
+ let size = 1024 in
+ let out_buf = String.create size in
+
+ let rec recode_loop k s_done in_enc =
+ (* 'k' bytes of 's' have already been processed, and the result is in
+ * 's_done'.
+ *)
+ (* Recode to 'out_buf': *)
+ let in_len = length - k in
+ let in_done, out_done, in_enc' =
+ recode ~in_enc:in_enc ~in_buf:s ~in_pos:k ~in_len:in_len
+ ~out_enc:out_enc ~out_buf:out_buf ~out_pos:0 ~out_len:size
+ ~max_chars:size ~subst:subst in
+ (* Collect the results: *)
+ let k' = k + in_done in
+ let s_done' = String.sub out_buf 0 out_done :: s_done in
+ (* Still something to do? *)
+ if k' < length then
+ recode_loop k' s_done' in_enc'
+ else
+ (* No: Concatenate s_done' to get the final result. *)
+ String.concat "" (List.rev s_done')
+ in
+
+ recode_loop 0 [] in_enc
+;;
+
+
+(* ======================================================================
+ * History:
+ *
+ * $Log$
+ * Revision 1.1 2000/11/17 09:57:28 lpadovan
+ * Initial revision
+ *
+ * Revision 1.2 2000/08/29 00:46:41 gerd
+ * New type for the Unicode to 8 bit translation table.
+ * The Netmappings tables are now Lazy.t.
+ *
+ * Revision 1.1 2000/08/13 00:02:57 gerd
+ * Initial revision.
+ *
+ *
+ * ======================================================================
+ * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.ml):
+ *
+ * Revision 1.5 2000/07/27 00:41:14 gerd
+ * new 8 bit codes
+ *
+ * Revision 1.4 2000/07/04 22:11:41 gerd
+ * Implemented the enhancements and extensions of
+ * rev. 1.4 of pxp_encoding.mli.
+ *
+ * Revision 1.3 2000/05/29 23:48:38 gerd
+ * Changed module names:
+ * Markup_aux into Pxp_aux
+ * Markup_codewriter into Pxp_codewriter
+ * Markup_document into Pxp_document
+ * Markup_dtd into Pxp_dtd
+ * Markup_entity into Pxp_entity
+ * Markup_lexer_types into Pxp_lexer_types
+ * Markup_reader into Pxp_reader
+ * Markup_types into Pxp_types
+ * Markup_yacc into Pxp_yacc
+ * See directory "compatibility" for (almost) compatible wrappers emulating
+ * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
+ *
+ * Revision 1.2 2000/05/29 21:14:57 gerd
+ * Changed the type 'encoding' into a polymorphic variant.
+ *
+ * Revision 1.1 2000/05/20 20:30:50 gerd
+ * Initial revision.
+ *
+ *
+ *)