+++ /dev/null
-(* $Id$
- * ----------------------------------------------------------------------
- *)
-
-exception Malformed_code
-
-
-type encoding =
- [ `Enc_utf8 (* UTF-8 *)
- | `Enc_java
- | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *)
- | `Enc_utf16_le (* UTF-16 little endian *)
- | `Enc_utf16_be (* UTF-16 big endian *)
- | `Enc_usascii (* US-ASCII (only 7 bit) *)
- | `Enc_iso88591 (* ISO-8859-1 *)
- | `Enc_iso88592 (* ISO-8859-2 *)
- | `Enc_iso88593 (* ISO-8859-3 *)
- | `Enc_iso88594 (* ISO-8859-4 *)
- | `Enc_iso88595 (* ISO-8859-5 *)
- | `Enc_iso88596 (* ISO-8859-6 *)
- | `Enc_iso88597 (* ISO-8859-7 *)
- | `Enc_iso88598 (* ISO-8859-8 *)
- | `Enc_iso88599 (* ISO-8859-9 *)
- | `Enc_iso885910 (* ISO-8859-10 *)
- | `Enc_iso885913 (* ISO-8859-13 *)
- | `Enc_iso885914 (* ISO-8859-14 *)
- | `Enc_iso885915 (* ISO-8859-15 *)
- | `Enc_koi8r (* KOI8-R *)
- | `Enc_jis0201 (* JIS-0201 *)
- (* Microsoft: *)
- | `Enc_windows1250 (* WINDOWS-1250 *)
- | `Enc_windows1251 (* WINDOWS-1251 *)
- | `Enc_windows1252 (* WINDOWS-1252 *)
- | `Enc_windows1253 (* WINDOWS-1253 *)
- | `Enc_windows1254 (* WINDOWS-1254 *)
- | `Enc_windows1255 (* WINDOWS-1255 *)
- | `Enc_windows1256 (* WINDOWS-1256 *)
- | `Enc_windows1257 (* WINDOWS-1257 *)
- | `Enc_windows1258 (* WINDOWS-1258 *)
- (* IBM, ASCII-based: *)
- | `Enc_cp437
- | `Enc_cp737
- | `Enc_cp775
- | `Enc_cp850
- | `Enc_cp852
- | `Enc_cp855
- | `Enc_cp856
- | `Enc_cp857
- | `Enc_cp860
- | `Enc_cp861
- | `Enc_cp862
- | `Enc_cp863
- | `Enc_cp864
- | `Enc_cp865
- | `Enc_cp866
- | `Enc_cp869
- | `Enc_cp874
- | `Enc_cp1006
- (* IBM, EBCDIC-based: *)
- | `Enc_cp037
- | `Enc_cp424
- | `Enc_cp500
- | `Enc_cp875
- | `Enc_cp1026
- (* Adobe: *)
- | `Enc_adobe_standard_encoding
- | `Enc_adobe_symbol_encoding
- | `Enc_adobe_zapf_dingbats_encoding
- (* Apple: *)
- | `Enc_macroman
-
- ]
-;;
-
-
-let norm_enc_name e =
- (* Removes some characters from e; uppercase *)
- let e' = String.create (String.length e) in
- let rec next i j =
- if i < String.length e then
- match e.[i] with
- ('-'|'_'|'.') -> next (i+1) j
- | c -> e'.[j] <- c; next (i+1) (j+1)
- else
- j
- in
- let l = next 0 0 in
- String.uppercase(String.sub e' 0 l)
-;;
-
-
-let encoding_of_string e =
- match norm_enc_name e with
- ("UTF16"|"UCS2"|"ISO10646UCS2") -> `Enc_utf16
- | "UTF16BE" -> `Enc_utf16_be
- | "UTF16LE" -> `Enc_utf16_le
- | "UTF8" -> `Enc_utf8
- | ("UTF8JAVA"|"JAVA") -> `Enc_java
- | ("USASCII"|"ASCII"|"ISO646US"|"IBM367"|"CP367") -> `Enc_usascii
- | ("ISO88591"|"LATIN1"|"IBM819"|"CP819") -> `Enc_iso88591
- | ("ISO88592"|"LATIN2") -> `Enc_iso88592
- | ("ISO88593"|"LATIN3") -> `Enc_iso88593
- | ("ISO88594"|"LATIN4") -> `Enc_iso88594
- | ("ISO88595"|"CYRILLIC") -> `Enc_iso88595
- | ("ISO88596"|"ARABIC"|"ECMA114"|"ASMO708") -> `Enc_iso88596
- | ("ISO88597"|"GREEK"|"GREEK8"|"ELOT928"|"ECMA118") -> `Enc_iso88597
- | ("ISO88598"|"HEBREW") -> `Enc_iso88598
- | ("ISO88599"|"LATIN5") -> `Enc_iso88599
- | ("ISO885910"|"LATIN6") -> `Enc_iso885910
- | "ISO885913" -> `Enc_iso885913
- | "ISO885914" -> `Enc_iso885914
- | "ISO885915" -> `Enc_iso885915
- | "KOI8R" -> `Enc_koi8r
- | ("JIS0201"|"JISX0201"|"X0201") -> `Enc_jis0201
-
- | "WINDOWS1250" -> `Enc_windows1250
- | "WINDOWS1251" -> `Enc_windows1251
- | "WINDOWS1252" -> `Enc_windows1252
- | "WINDOWS1253" -> `Enc_windows1253
- | "WINDOWS1254" -> `Enc_windows1254
- | "WINDOWS1255" -> `Enc_windows1255
- | "WINDOWS1256" -> `Enc_windows1256
- | "WINDOWS1257" -> `Enc_windows1257
- | "WINDOWS1258" -> `Enc_windows1258
-
- | ("CP437"|"IBM437") -> `Enc_cp437
- | ("CP737"|"IBM737") -> `Enc_cp737
- | ("CP775"|"IBM775") -> `Enc_cp775
- | ("CP850"|"IBM850") -> `Enc_cp850
- | ("CP852"|"IBM852") -> `Enc_cp852
- | ("CP855"|"IBM855") -> `Enc_cp855
- | ("CP856"|"IBM856") -> `Enc_cp856
- | ("CP857"|"IBM857") -> `Enc_cp857
- | ("CP860"|"IBM860") -> `Enc_cp860
- | ("CP861"|"IBM861") -> `Enc_cp861
- | ("CP862"|"IBM862") -> `Enc_cp862
- | ("CP863"|"IBM863") -> `Enc_cp863
- | ("CP864"|"IBM864") -> `Enc_cp864
- | ("CP865"|"IBM865") -> `Enc_cp865
- | ("CP866"|"IBM866") -> `Enc_cp866
- | ("CP869"|"IBM869") -> `Enc_cp869
- | ("CP874"|"IBM874") -> `Enc_cp874
- | ("CP1006"|"IBM1006") -> `Enc_cp1006
-
- | ("CP037"|"IBM037"|"EBCDICCPUS"|"EBCDICCPCA"|"EBCDICCPWT"|
- "EBCDICCPNL") -> `Enc_cp037
- | ("CP424"|"IBM424"|"EBCDICCPHE") -> `Enc_cp424
- | ("CP500"|"IBM500"|"EBCDICCPBE"|"EBCDICCPCH") -> `Enc_cp500
- | ("CP875"|"IBM875") -> `Enc_cp875
- | ("CP1026"|"IBM1026") -> `Enc_cp1026
-
- | "ADOBESTANDARDENCODING" -> `Enc_adobe_standard_encoding
- | "ADOBESYMBOLENCODING" -> `Enc_adobe_symbol_encoding
- | "ADOBEZAPFDINGBATSENCODING" -> `Enc_adobe_zapf_dingbats_encoding
-
- | "MACINTOSH" -> `Enc_macroman
-
- | _ ->
- failwith "Netconversion.encoding_of_string: unknown encoding"
-;;
-
-
-let string_of_encoding (e : encoding) =
- (* If there is a "preferred MIME name", this name is returned (see IANA). *)
- match e with
- `Enc_utf16 -> "UTF-16"
- | `Enc_utf16_be -> "UTF-16-BE"
- | `Enc_utf16_le -> "UTF-16-LE"
- | `Enc_utf8 -> "UTF-8"
- | `Enc_java -> "UTF-8-JAVA"
- | `Enc_usascii -> "US-ASCII"
- | `Enc_iso88591 -> "ISO-8859-1"
- | `Enc_iso88592 -> "ISO-8859-2"
- | `Enc_iso88593 -> "ISO-8859-3"
- | `Enc_iso88594 -> "ISO-8859-4"
- | `Enc_iso88595 -> "ISO-8859-5"
- | `Enc_iso88596 -> "ISO-8859-6"
- | `Enc_iso88597 -> "ISO-8859-7"
- | `Enc_iso88598 -> "ISO-8859-8"
- | `Enc_iso88599 -> "ISO-8859-9"
- | `Enc_iso885910 -> "ISO-8859-10"
- | `Enc_iso885913 -> "ISO-8859-13"
- | `Enc_iso885914 -> "ISO-8859-14"
- | `Enc_iso885915 -> "ISO-8859-15"
- | `Enc_koi8r -> "KOI8-R"
- | `Enc_jis0201 -> "JIS_X0201"
- | `Enc_windows1250 -> "WINDOWS-1250"
- | `Enc_windows1251 -> "WINDOWS-1251"
- | `Enc_windows1252 -> "WINDOWS-1252"
- | `Enc_windows1253 -> "WINDOWS-1253"
- | `Enc_windows1254 -> "WINDOWS-1254"
- | `Enc_windows1255 -> "WINDOWS-1255"
- | `Enc_windows1256 -> "WINDOWS-1256"
- | `Enc_windows1257 -> "WINDOWS-1257"
- | `Enc_windows1258 -> "WINDOWS-1258"
- | `Enc_cp437 -> "CP437"
- | `Enc_cp737 -> "CP737"
- | `Enc_cp775 -> "CP775"
- | `Enc_cp850 -> "CP850"
- | `Enc_cp852 -> "CP852"
- | `Enc_cp855 -> "CP855"
- | `Enc_cp856 -> "CP856"
- | `Enc_cp857 -> "CP857"
- | `Enc_cp860 -> "CP860"
- | `Enc_cp861 -> "CP861"
- | `Enc_cp862 -> "CP862"
- | `Enc_cp863 -> "CP863"
- | `Enc_cp864 -> "CP864"
- | `Enc_cp865 -> "CP865"
- | `Enc_cp866 -> "CP866"
- | `Enc_cp869 -> "CP869"
- | `Enc_cp874 -> "CP874"
- | `Enc_cp1006 -> "CP1006"
- | `Enc_cp037 -> "CP037"
- | `Enc_cp424 -> "CP424"
- | `Enc_cp500 -> "CP500"
- | `Enc_cp875 -> "CP875"
- | `Enc_cp1026 -> "CP1026"
- | `Enc_adobe_standard_encoding -> "ADOBE-STANDARD-ENCODING"
- | `Enc_adobe_symbol_encoding -> "ADOBE-SYMBOL-ENCODING"
- | `Enc_adobe_zapf_dingbats_encoding -> "ADOBE-ZAPF-DINGBATS-ENCODING"
- | `Enc_macroman -> "MACINTOSH"
-;;
-
-
-let read_iso88591 write s_in p_in l_in =
- let rec scan k_in k_out c_out =
- if k_in < l_in then begin
- let p = Char.code s_in.[p_in + k_in] in
- let n = write p k_out c_out in
- if n < 0 then
- k_in, k_out, `Enc_iso88591
- else
- scan (k_in + 1) (k_out + n) (c_out + 1)
- end
- else
- k_in, k_out, `Enc_iso88591
- in
- scan 0 0 0
-;;
-
-
-let read_usascii write s_in p_in l_in =
- let rec scan k_in k_out c_out =
- if k_in < l_in then begin
- let p = Char.code s_in.[p_in + k_in] in
- if p >= 0x80 then raise Malformed_code;
- let n = write p k_out c_out in
- if n < 0 then
- k_in, k_out, `Enc_usascii
- else
- scan (k_in + 1) (k_out + n) (c_out + 1)
- end
- else
- k_in, k_out, `Enc_usascii
- in
- scan 0 0 0
-;;
-
-
-let read_8bit m_to_unicode enc write s_in p_in l_in =
- let rec scan k_in k_out c_out =
- if k_in < l_in then begin
- let p_local = Char.code s_in.[p_in + k_in] in
- let p_uni = Array.unsafe_get m_to_unicode p_local in
- if p_uni < 0 then raise Malformed_code;
- let n = write p_uni k_out c_out in
- if n < 0 then
- k_in, k_out, enc
- else
- scan (k_in + 1) (k_out + n) (c_out + 1)
- end
- else
- k_in, k_out, enc
- in
- scan 0 0 0
-;;
-
-
-let read_utf8 is_java write s_in p_in l_in =
- let rec scan k_in k_out c_out =
- if k_in < l_in then begin
- let n_out, n_in =
- match s_in.[p_in + k_in] with
- '\000' ->
- if is_java then raise Malformed_code;
- write 0 k_out c_out, 1
- | ('\001'..'\127' as c) ->
- write (Char.code c) k_out c_out, 1
- | ('\128'..'\223' as c) ->
- if k_in + 1 >= l_in then
- -1, 0
- else begin
- let n1 = Char.code c in
- let n2 = Char.code (s_in.[p_in + k_in + 1]) in
- if is_java && (n1 = 0x80 && n2 = 0xc0) then
- write 0 k_out c_out, 2
- else begin
- if n2 < 128 or n2 > 191 then raise Malformed_code;
- let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in
- if p < 128 then raise Malformed_code;
- write p k_out c_out, 2
- end
- end
- | ('\224'..'\239' as c) ->
- if k_in + 2 >= l_in then
- -1, 0
- else begin
- let n1 = Char.code c in
- let n2 = Char.code (s_in.[p_in + k_in + 1]) in
- let n3 = Char.code (s_in.[p_in + k_in + 2]) in
- if n2 < 128 or n2 > 191 then raise Malformed_code;
- if n3 < 128 or n3 > 191 then raise Malformed_code;
- let p =
- ((n1 land 0b1111) lsl 12) lor
- ((n2 land 0b111111) lsl 6) lor
- (n3 land 0b111111)
- in
- if p < 0x800 then raise Malformed_code;
- if (p >= 0xd800 && p < 0xe000) then
- (* Surrogate pairs are not supported in UTF-8 *)
- raise Malformed_code;
- if (p >= 0xfffe && p <= 0xffff) then
- raise Malformed_code;
- write p k_out c_out, 3
- end
- | ('\240'..'\247' as c) ->
- if k_in + 3 >= l_in then
- -1, 0
- else begin
- let n1 = Char.code c in
- let n2 = Char.code (s_in.[p_in + k_in + 1]) in
- let n3 = Char.code (s_in.[p_in + k_in + 2]) in
- let n4 = Char.code (s_in.[p_in + k_in + 3]) in
- if n2 < 128 or n2 > 191 then raise Malformed_code;
- if n3 < 128 or n3 > 191 then raise Malformed_code;
- if n4 < 128 or n4 > 191 then raise Malformed_code;
- let p = ((n1 land 0b111) lsl 18) lor
- ((n2 land 0b111111) lsl 12) lor
- ((n3 land 0b111111) lsl 6) lor
- (n4 land 0b111111)
- in
- if p < 0x10000 then raise Malformed_code;
- if p >= 0x110000 then
- (* These code points are not supported. *)
- raise Malformed_code;
- write p k_out c_out, 4
- end
- | _ ->
- (* Outside the valid range of XML characters *)
- raise Malformed_code;
- in
- (* n_out: number of written bytes; -1 means out buf is full
- * n_in: number of read bytes; 0 means end of in buf reached
- * n_in = 0 implies n_out = -1
- *)
- if n_out < 0 then
- k_in, k_out, `Enc_utf8
- else
- scan (k_in + n_in) (k_out + n_out) (c_out + 1)
- end
- else
- k_in, k_out, `Enc_utf8
- in
- scan 0 0 0
-;;
-
-
-let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00;;
-
-let read_utf16_le k_in_0 write s_in p_in l_in =
- let rec scan k_in k_out c_out =
- if k_in + 1 < l_in then begin
- let p = (Char.code s_in.[p_in + k_in]) lor ((Char.code s_in.[p_in + k_in + 1]) lsl 8) in
-
- if p >= 0xd800 & p < 0xe000 then begin
- (* This is a surrogate pair. *)
- if k_in + 3 < l_in then begin
- if p <= 0xdbff then begin
- let q = (Char.code s_in.[p_in + k_in + 2 ]) lor
- ((Char.code s_in.[p_in + k_in + 3]) lsl 8) in
- if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
- let eff_p = (p lsl 10) + q + surrogate_offset in
- let n = write eff_p k_out c_out in
- if n < 0 then
- k_in, k_out, `Enc_utf16_le
- else
- scan (k_in + 4) (k_out + n) (c_out + 1)
- end
- else
- (* Malformed pair: *)
- raise Malformed_code;
- end
- else
- (* Incomplete pair: *)
- k_in, k_out, `Enc_utf16_le
- end
-
- else
- if p = 0xfffe then
- (* Big endian byte order mark: It is illegal here *)
- raise Malformed_code
- else begin
- (* A regular code point *)
- let n = write p k_out c_out in
- if n < 0 then
- k_in, k_out, `Enc_utf16_le
- else
- scan (k_in + 2) (k_out + n) (c_out + 1)
- end
- end
- else
- (* Incomplete character: *)
- k_in, k_out, `Enc_utf16_le
- in
- scan k_in_0 0 0
-;;
-
-
-let read_utf16_be k_in_0 write s_in p_in l_in =
- let rec scan k_in k_out c_out =
- if k_in + 1 < l_in then begin
- let p = (Char.code s_in.[p_in + k_in + 1]) lor ((Char.code s_in.[p_in + k_in]) lsl 8) in
-
- if p >= 0xd800 & p < 0xe000 then begin
- (* This is a surrogate pair. *)
- if k_in + 3 < l_in then begin
- if p <= 0xdbff then begin
- let q = (Char.code s_in.[p_in + k_in + 3 ]) lor
- ((Char.code s_in.[p_in + k_in + 2]) lsl 8) in
- if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
- let eff_p = (p lsl 10) + q + surrogate_offset in
- let n = write eff_p k_out c_out in
- if n < 0 then
- k_in, k_out, `Enc_utf16_be
- else
- scan (k_in + 4) (k_out + n) (c_out + 1)
- end
- else
- (* Malformed pair: *)
- raise Malformed_code;
- end
- else
- (* Incomplete pair: *)
- k_in, k_out, `Enc_utf16_be
- end
-
- else
- if p = 0xfffe then
- (* Little endian byte order mark: It is illegal here *)
- raise Malformed_code
- else begin
- (* A regular code point *)
- let n = write p k_out c_out in
- if n < 0 then
- k_in, k_out, `Enc_utf16_be
- else
- scan (k_in + 2) (k_out + n) (c_out + 1)
- end
-
- end
- else
- (* Incomplete character: *)
- k_in, k_out, `Enc_utf16_be
- in
- scan k_in_0 0 0
-;;
-
-
-let read_utf16 write s_in p_in l_in =
- (* Expect a BOM at the beginning of the text *)
- if l_in >= 2 then begin
- let c0 = s_in.[p_in + 0] in
- let c1 = s_in.[p_in + 1] in
- if c0 = '\254' & c1 = '\255' then begin
- (* 0xfeff as big endian *)
- read_utf16_be 2 write s_in p_in l_in
- end
- else
- if c0 = '\255' & c1 = '\254' then begin
- (* 0xfeff as little endian *)
- read_utf16_le 2 write s_in p_in l_in
- end
- else
- (* byte order mark missing *)
- raise Malformed_code
- end
- else
- 0, 0, `Enc_utf16
-;;
-
-
-let write_iso88591 s_out p_out l_out max_chars w p k_out c_out =
- if k_out < l_out && c_out < max_chars then begin
- if p > 255 then begin
- let subst = w p in
- let l_subst = String.length subst in
- if k_out + l_subst <= l_out then begin
- (* Enough space to store 'subst': *)
- String.blit subst 0 s_out (k_out+p_out) l_subst;
- l_subst
- end
- else
- (* Not enough space: Stop this round of recoding *)
- -1
- end
- else begin
- s_out.[p_out + k_out] <- Char.chr p;
- 1
- end
- end
- else
- -1 (* End-of-buffer indicator *)
-;;
-
-
-let write_usascii s_out p_out l_out max_chars w p k_out c_out =
- if k_out < l_out && c_out < max_chars then begin
- if p > 127 then begin
- let subst = w p in
- let l_subst = String.length subst in
- if k_out + l_subst <= l_out then begin
- (* Enough space to store 'subst': *)
- String.blit subst 0 s_out (k_out+p_out) l_subst;
- l_subst
- end
- else
- (* Not enough space: Stop this round of recoding *)
- -1
- end
- else begin
- s_out.[p_out + k_out] <- Char.chr p;
- 1
- end
- end
- else
- -1 (* End-of-buffer indicator *)
-;;
-
-
-let write_8bit from_unicode s_out p_out l_out max_chars w p k_out c_out =
- if k_out < l_out && c_out < max_chars then begin
- let p' =
- match Array.unsafe_get from_unicode (p land 255) with
- Netmappings.U_nil -> -1
- | Netmappings.U_single (p0,q0) ->
- if p0 = p then q0 else -1
- | Netmappings.U_list l ->
- (try List.assoc p l with Not_found -> -1)
- in
- if p' < 0 then begin
- let subst = w p in
- let l_subst = String.length subst in
- if k_out + l_subst <= l_out then begin
- (* Enough space to store 'subst': *)
- String.blit subst 0 s_out (k_out+p_out) l_subst;
- l_subst
- end
- else
- (* Not enough space: Stop this round of recoding *)
- -1
- end
- else begin
- s_out.[p_out + k_out] <- Char.chr p';
- 1
- end
- end
- else
- -1 (* End-of-buffer indicator *)
-;;
-
-
-let write_utf8 is_java s_out p_out l_out max_chars w p k_out c_out =
- if p <= 127 && (not is_java || p <> 0) then begin
- if k_out < l_out && c_out < max_chars then begin
- s_out.[p_out + k_out] <- Char.chr p;
- 1
- end
- else -1
- end
- else if p <= 0x7ff then begin
- if k_out + 1 < l_out && c_out < max_chars then begin
- s_out.[p_out + k_out] <- Char.chr (0xc0 lor (p lsr 6));
- s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor (p land 0x3f));
- 2
- end
- else -1
- end
- else if p <= 0xffff then begin
- (* Refuse writing surrogate pairs, and fffe, ffff *)
- if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then
- failwith "Netconversion.write_utf8";
- if k_out + 2 < l_out && c_out < max_chars then begin
- s_out.[p_out + k_out] <- Char.chr (0xe0 lor (p lsr 12));
- s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f));
- s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor (p land 0x3f));
- 3
- end
- else -1
- end
- else if p <= 0x10ffff then begin
- if k_out + 3 < l_out && c_out < max_chars then begin
- s_out.[p_out + k_out] <- Char.chr (0xf0 lor (p lsr 18));
- s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 12) land 0x3f));
- s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f));
- s_out.[p_out + k_out + 3] <- Char.chr (0x80 lor (p land 0x3f));
- 4
- end
- else -1
- end
- else
- (* Higher code points are not possible in XML: *)
- failwith "Netconversion.write_utf8"
-;;
-
-
-let write_utf16_le s_out p_out l_out max_chars w p k_out c_out =
- if p >= 0xfffe then begin
- if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_le";
- (* Must be written as surrogate pair *)
- if k_out + 3 < l_out && c_out < max_chars then begin
- let high = (p lsr 10) + 0xd800 in
- let low = (p land 0x3ff) + 0xdc00 in
- s_out.[p_out + k_out ] <- Char.chr (high land 0xff);
- s_out.[p_out + k_out + 1] <- Char.chr (high lsr 8);
- s_out.[p_out + k_out + 2] <- Char.chr (low land 0xff);
- s_out.[p_out + k_out + 3] <- Char.chr (low lsr 8);
- 4
- end
- else -1
- end
- else begin
- if k_out + 1 < l_out && c_out < max_chars then begin
- s_out.[p_out + k_out ] <- Char.chr (p land 0xff);
- s_out.[p_out + k_out + 1] <- Char.chr (p lsr 8);
- 2
- end
- else
- -1
- end
-;;
-
-
-let write_utf16_be s_out p_out l_out max_chars w p k_out c_out =
- if p >= 0xfffe then begin
- if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_be";
- (* Must be written as surrogate pair *)
- if k_out + 3 < l_out && c_out < max_chars then begin
- let high = (p lsr 10) + 0xd800 in
- let low = (p land 0x3ff) + 0xdc00 in
- s_out.[p_out + k_out + 1] <- Char.chr (high land 0xff);
- s_out.[p_out + k_out ] <- Char.chr (high lsr 8);
- s_out.[p_out + k_out + 3] <- Char.chr (low land 0xff);
- s_out.[p_out + k_out + 2] <- Char.chr (low lsr 8);
- 4
- end
- else -1
- end
- else begin
- if k_out + 1 < l_out && c_out < max_chars then begin
- s_out.[p_out + k_out + 1] <- Char.chr (p land 0xff);
- s_out.[p_out + k_out ] <- Char.chr (p lsr 8);
- 2
- end
- else
- -1
- end
-;;
-
-
-let recode ~in_enc
- ~in_buf
- ~in_pos
- ~in_len
- ~out_enc
- ~out_buf
- ~out_pos
- ~out_len
- ~max_chars
- ~subst =
- if (in_pos < 0 || in_len < 0 || in_pos + in_len > String.length in_buf ||
- out_pos < 0 || out_len < 0 || out_pos + out_len > String.length out_buf)
- then
- invalid_arg "Netconversion.recode";
-
- let reader =
- match in_enc with
- `Enc_iso88591 -> read_iso88591
- | `Enc_usascii -> read_usascii
- | `Enc_utf8 -> read_utf8 false
- | `Enc_java -> read_utf8 true
- | `Enc_utf16 -> read_utf16
- | `Enc_utf16_le -> read_utf16_le 0
- | `Enc_utf16_be -> read_utf16_be 0
- | _ ->
- (try
- let to_unicode' = Hashtbl.find Netmappings.to_unicode in_enc in
- let to_unicode =
- Netmappings.lock();
- Lazy.force to_unicode' in
- Netmappings.unlock();
- read_8bit to_unicode in_enc
- with
- Not_found ->
- failwith("Support for the encoding `" ^
- string_of_encoding in_enc ^
- "' has not been compiled into Netstring")
- )
- in
- let writer =
- match out_enc with
- `Enc_iso88591 -> write_iso88591 out_buf out_pos out_len max_chars subst
- | `Enc_usascii -> write_usascii out_buf out_pos out_len max_chars subst
- | `Enc_utf8 -> write_utf8 false
- out_buf out_pos out_len max_chars subst
- | `Enc_java -> write_utf8 true out_buf out_pos out_len max_chars subst
- | `Enc_utf16 -> failwith "Netconversion.recode"
- | `Enc_utf16_le -> write_utf16_le out_buf out_pos out_len max_chars subst
- | `Enc_utf16_be -> write_utf16_be out_buf out_pos out_len max_chars subst
- | _ ->
- (try
- let from_unicode' = Hashtbl.find Netmappings.from_unicode out_enc
- in
- let from_unicode =
- Netmappings.lock();
- Lazy.force from_unicode' in
- Netmappings.unlock();
- write_8bit from_unicode out_buf out_pos out_len max_chars subst
- with
- Not_found ->
- failwith("Support for the encoding `" ^
- string_of_encoding out_enc ^
- "' has not been compiled into Netstring")
- )
- in
- reader writer in_buf in_pos in_len
-;;
-
-
-let makechar enc p =
- match enc with
- `Enc_iso88591 ->
- if p > 255 then raise Not_found;
- String.make 1 (Char.chr p)
- | `Enc_usascii ->
- if p > 127 then raise Not_found;
- String.make 1 (Char.chr p)
- | `Enc_utf8 ->
- let s = String.create 4 in
- let n = write_utf8 false s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
- String.sub s 0 n
- | `Enc_java ->
- let s = String.create 4 in
- let n = write_utf8 true s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
- String.sub s 0 n
- | `Enc_utf16_le ->
- let s = String.create 4 in
- let n = write_utf16_le s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
- String.sub s 0 n
- | `Enc_utf16_be ->
- let s = String.create 4 in
- let n = write_utf16_be s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
- String.sub s 0 n
- | `Enc_utf16 ->
- failwith "Netconversion.makechar"
- | _ ->
- let s = String.create 1 in
- let from_unicode' =
- try
- Hashtbl.find Netmappings.from_unicode enc
- with
- Not_found ->
- failwith("Support for the encoding `" ^
- string_of_encoding enc ^
- "' has not been compiled into Netstring")
- in
- let from_unicode =
- Netmappings.lock();
- Lazy.force from_unicode' in
- Netmappings.unlock();
- let n =
- write_8bit from_unicode s 0 1 1 (fun _ -> raise Not_found) p 0 0 in
- s
-;;
-
-
-let recode_string ~in_enc ~out_enc ?(subst = (fun _ -> raise Not_found)) s =
-
- let length = String.length s in
- let size = 1024 in
- let out_buf = String.create size in
-
- let rec recode_loop k s_done in_enc =
- (* 'k' bytes of 's' have already been processed, and the result is in
- * 's_done'.
- *)
- (* Recode to 'out_buf': *)
- let in_len = length - k in
- let in_done, out_done, in_enc' =
- recode ~in_enc:in_enc ~in_buf:s ~in_pos:k ~in_len:in_len
- ~out_enc:out_enc ~out_buf:out_buf ~out_pos:0 ~out_len:size
- ~max_chars:size ~subst:subst in
- (* Collect the results: *)
- let k' = k + in_done in
- let s_done' = String.sub out_buf 0 out_done :: s_done in
- (* Still something to do? *)
- if k' < length then
- recode_loop k' s_done' in_enc'
- else
- (* No: Concatenate s_done' to get the final result. *)
- String.concat "" (List.rev s_done')
- in
-
- recode_loop 0 [] in_enc
-;;
-
-
-(* ======================================================================
- * History:
- *
- * $Log$
- * Revision 1.1 2000/11/17 09:57:28 lpadovan
- * Initial revision
- *
- * Revision 1.2 2000/08/29 00:46:41 gerd
- * New type for the Unicode to 8 bit translation table.
- * The Netmappings tables are now Lazy.t.
- *
- * Revision 1.1 2000/08/13 00:02:57 gerd
- * Initial revision.
- *
- *
- * ======================================================================
- * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.ml):
- *
- * Revision 1.5 2000/07/27 00:41:14 gerd
- * new 8 bit codes
- *
- * Revision 1.4 2000/07/04 22:11:41 gerd
- * Implemented the enhancements and extensions of
- * rev. 1.4 of pxp_encoding.mli.
- *
- * Revision 1.3 2000/05/29 23:48:38 gerd
- * Changed module names:
- * Markup_aux into Pxp_aux
- * Markup_codewriter into Pxp_codewriter
- * Markup_document into Pxp_document
- * Markup_dtd into Pxp_dtd
- * Markup_entity into Pxp_entity
- * Markup_lexer_types into Pxp_lexer_types
- * Markup_reader into Pxp_reader
- * Markup_types into Pxp_types
- * Markup_yacc into Pxp_yacc
- * See directory "compatibility" for (almost) compatible wrappers emulating
- * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
- *
- * Revision 1.2 2000/05/29 21:14:57 gerd
- * Changed the type 'encoding' into a polymorphic variant.
- *
- * Revision 1.1 2000/05/20 20:30:50 gerd
- * Initial revision.
- *
- *
- *)