X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnetconversion.ml;fp=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnetconversion.ml;h=e740654ade9805b86902a499bc6a2838f304f801;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/netstring/netconversion.ml b/helm/DEVEL/pxp/netstring/netconversion.ml new file mode 100644 index 000000000..e740654ad --- /dev/null +++ b/helm/DEVEL/pxp/netstring/netconversion.ml @@ -0,0 +1,864 @@ +(* $Id$ + * ---------------------------------------------------------------------- + *) + +exception Malformed_code + + +type encoding = + [ `Enc_utf8 (* UTF-8 *) + | `Enc_java + | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *) + | `Enc_utf16_le (* UTF-16 little endian *) + | `Enc_utf16_be (* UTF-16 big endian *) + | `Enc_usascii (* US-ASCII (only 7 bit) *) + | `Enc_iso88591 (* ISO-8859-1 *) + | `Enc_iso88592 (* ISO-8859-2 *) + | `Enc_iso88593 (* ISO-8859-3 *) + | `Enc_iso88594 (* ISO-8859-4 *) + | `Enc_iso88595 (* ISO-8859-5 *) + | `Enc_iso88596 (* ISO-8859-6 *) + | `Enc_iso88597 (* ISO-8859-7 *) + | `Enc_iso88598 (* ISO-8859-8 *) + | `Enc_iso88599 (* ISO-8859-9 *) + | `Enc_iso885910 (* ISO-8859-10 *) + | `Enc_iso885913 (* ISO-8859-13 *) + | `Enc_iso885914 (* ISO-8859-14 *) + | `Enc_iso885915 (* ISO-8859-15 *) + | `Enc_koi8r (* KOI8-R *) + | `Enc_jis0201 (* JIS-0201 *) + (* Microsoft: *) + | `Enc_windows1250 (* WINDOWS-1250 *) + | `Enc_windows1251 (* WINDOWS-1251 *) + | `Enc_windows1252 (* WINDOWS-1252 *) + | `Enc_windows1253 (* WINDOWS-1253 *) + | `Enc_windows1254 (* WINDOWS-1254 *) + | `Enc_windows1255 (* WINDOWS-1255 *) + | `Enc_windows1256 (* WINDOWS-1256 *) + | `Enc_windows1257 (* WINDOWS-1257 *) + | `Enc_windows1258 (* WINDOWS-1258 *) + (* IBM, ASCII-based: *) + | `Enc_cp437 + | `Enc_cp737 + | `Enc_cp775 + | `Enc_cp850 + | `Enc_cp852 + | `Enc_cp855 + | `Enc_cp856 + | `Enc_cp857 + | `Enc_cp860 + | `Enc_cp861 + | `Enc_cp862 + | `Enc_cp863 + | `Enc_cp864 + | `Enc_cp865 + | `Enc_cp866 + | `Enc_cp869 + | `Enc_cp874 + | `Enc_cp1006 + (* IBM, EBCDIC-based: *) + | `Enc_cp037 + | `Enc_cp424 + | `Enc_cp500 + | `Enc_cp875 + | `Enc_cp1026 + (* Adobe: *) + | `Enc_adobe_standard_encoding + | `Enc_adobe_symbol_encoding + | `Enc_adobe_zapf_dingbats_encoding + (* Apple: *) + | `Enc_macroman + + ] +;; + + +let norm_enc_name e = + (* Removes some characters from e; uppercase *) + let e' = String.create (String.length e) in + let rec next i j = + if i < String.length e then + match e.[i] with + ('-'|'_'|'.') -> next (i+1) j + | c -> e'.[j] <- c; next (i+1) (j+1) + else + j + in + let l = next 0 0 in + String.uppercase(String.sub e' 0 l) +;; + + +let encoding_of_string e = + match norm_enc_name e with + ("UTF16"|"UCS2"|"ISO10646UCS2") -> `Enc_utf16 + | "UTF16BE" -> `Enc_utf16_be + | "UTF16LE" -> `Enc_utf16_le + | "UTF8" -> `Enc_utf8 + | ("UTF8JAVA"|"JAVA") -> `Enc_java + | ("USASCII"|"ASCII"|"ISO646US"|"IBM367"|"CP367") -> `Enc_usascii + | ("ISO88591"|"LATIN1"|"IBM819"|"CP819") -> `Enc_iso88591 + | ("ISO88592"|"LATIN2") -> `Enc_iso88592 + | ("ISO88593"|"LATIN3") -> `Enc_iso88593 + | ("ISO88594"|"LATIN4") -> `Enc_iso88594 + | ("ISO88595"|"CYRILLIC") -> `Enc_iso88595 + | ("ISO88596"|"ARABIC"|"ECMA114"|"ASMO708") -> `Enc_iso88596 + | ("ISO88597"|"GREEK"|"GREEK8"|"ELOT928"|"ECMA118") -> `Enc_iso88597 + | ("ISO88598"|"HEBREW") -> `Enc_iso88598 + | ("ISO88599"|"LATIN5") -> `Enc_iso88599 + | ("ISO885910"|"LATIN6") -> `Enc_iso885910 + | "ISO885913" -> `Enc_iso885913 + | "ISO885914" -> `Enc_iso885914 + | "ISO885915" -> `Enc_iso885915 + | "KOI8R" -> `Enc_koi8r + | ("JIS0201"|"JISX0201"|"X0201") -> `Enc_jis0201 + + | "WINDOWS1250" -> `Enc_windows1250 + | "WINDOWS1251" -> `Enc_windows1251 + | "WINDOWS1252" -> `Enc_windows1252 + | "WINDOWS1253" -> `Enc_windows1253 + | "WINDOWS1254" -> `Enc_windows1254 + | "WINDOWS1255" -> `Enc_windows1255 + | "WINDOWS1256" -> `Enc_windows1256 + | "WINDOWS1257" -> `Enc_windows1257 + | "WINDOWS1258" -> `Enc_windows1258 + + | ("CP437"|"IBM437") -> `Enc_cp437 + | ("CP737"|"IBM737") -> `Enc_cp737 + | ("CP775"|"IBM775") -> `Enc_cp775 + | ("CP850"|"IBM850") -> `Enc_cp850 + | ("CP852"|"IBM852") -> `Enc_cp852 + | ("CP855"|"IBM855") -> `Enc_cp855 + | ("CP856"|"IBM856") -> `Enc_cp856 + | ("CP857"|"IBM857") -> `Enc_cp857 + | ("CP860"|"IBM860") -> `Enc_cp860 + | ("CP861"|"IBM861") -> `Enc_cp861 + | ("CP862"|"IBM862") -> `Enc_cp862 + | ("CP863"|"IBM863") -> `Enc_cp863 + | ("CP864"|"IBM864") -> `Enc_cp864 + | ("CP865"|"IBM865") -> `Enc_cp865 + | ("CP866"|"IBM866") -> `Enc_cp866 + | ("CP869"|"IBM869") -> `Enc_cp869 + | ("CP874"|"IBM874") -> `Enc_cp874 + | ("CP1006"|"IBM1006") -> `Enc_cp1006 + + | ("CP037"|"IBM037"|"EBCDICCPUS"|"EBCDICCPCA"|"EBCDICCPWT"| + "EBCDICCPNL") -> `Enc_cp037 + | ("CP424"|"IBM424"|"EBCDICCPHE") -> `Enc_cp424 + | ("CP500"|"IBM500"|"EBCDICCPBE"|"EBCDICCPCH") -> `Enc_cp500 + | ("CP875"|"IBM875") -> `Enc_cp875 + | ("CP1026"|"IBM1026") -> `Enc_cp1026 + + | "ADOBESTANDARDENCODING" -> `Enc_adobe_standard_encoding + | "ADOBESYMBOLENCODING" -> `Enc_adobe_symbol_encoding + | "ADOBEZAPFDINGBATSENCODING" -> `Enc_adobe_zapf_dingbats_encoding + + | "MACINTOSH" -> `Enc_macroman + + | _ -> + failwith "Netconversion.encoding_of_string: unknown encoding" +;; + + +let string_of_encoding (e : encoding) = + (* If there is a "preferred MIME name", this name is returned (see IANA). *) + match e with + `Enc_utf16 -> "UTF-16" + | `Enc_utf16_be -> "UTF-16-BE" + | `Enc_utf16_le -> "UTF-16-LE" + | `Enc_utf8 -> "UTF-8" + | `Enc_java -> "UTF-8-JAVA" + | `Enc_usascii -> "US-ASCII" + | `Enc_iso88591 -> "ISO-8859-1" + | `Enc_iso88592 -> "ISO-8859-2" + | `Enc_iso88593 -> "ISO-8859-3" + | `Enc_iso88594 -> "ISO-8859-4" + | `Enc_iso88595 -> "ISO-8859-5" + | `Enc_iso88596 -> "ISO-8859-6" + | `Enc_iso88597 -> "ISO-8859-7" + | `Enc_iso88598 -> "ISO-8859-8" + | `Enc_iso88599 -> "ISO-8859-9" + | `Enc_iso885910 -> "ISO-8859-10" + | `Enc_iso885913 -> "ISO-8859-13" + | `Enc_iso885914 -> "ISO-8859-14" + | `Enc_iso885915 -> "ISO-8859-15" + | `Enc_koi8r -> "KOI8-R" + | `Enc_jis0201 -> "JIS_X0201" + | `Enc_windows1250 -> "WINDOWS-1250" + | `Enc_windows1251 -> "WINDOWS-1251" + | `Enc_windows1252 -> "WINDOWS-1252" + | `Enc_windows1253 -> "WINDOWS-1253" + | `Enc_windows1254 -> "WINDOWS-1254" + | `Enc_windows1255 -> "WINDOWS-1255" + | `Enc_windows1256 -> "WINDOWS-1256" + | `Enc_windows1257 -> "WINDOWS-1257" + | `Enc_windows1258 -> "WINDOWS-1258" + | `Enc_cp437 -> "CP437" + | `Enc_cp737 -> "CP737" + | `Enc_cp775 -> "CP775" + | `Enc_cp850 -> "CP850" + | `Enc_cp852 -> "CP852" + | `Enc_cp855 -> "CP855" + | `Enc_cp856 -> "CP856" + | `Enc_cp857 -> "CP857" + | `Enc_cp860 -> "CP860" + | `Enc_cp861 -> "CP861" + | `Enc_cp862 -> "CP862" + | `Enc_cp863 -> "CP863" + | `Enc_cp864 -> "CP864" + | `Enc_cp865 -> "CP865" + | `Enc_cp866 -> "CP866" + | `Enc_cp869 -> "CP869" + | `Enc_cp874 -> "CP874" + | `Enc_cp1006 -> "CP1006" + | `Enc_cp037 -> "CP037" + | `Enc_cp424 -> "CP424" + | `Enc_cp500 -> "CP500" + | `Enc_cp875 -> "CP875" + | `Enc_cp1026 -> "CP1026" + | `Enc_adobe_standard_encoding -> "ADOBE-STANDARD-ENCODING" + | `Enc_adobe_symbol_encoding -> "ADOBE-SYMBOL-ENCODING" + | `Enc_adobe_zapf_dingbats_encoding -> "ADOBE-ZAPF-DINGBATS-ENCODING" + | `Enc_macroman -> "MACINTOSH" +;; + + +let read_iso88591 write s_in p_in l_in = + let rec scan k_in k_out c_out = + if k_in < l_in then begin + let p = Char.code s_in.[p_in + k_in] in + let n = write p k_out c_out in + if n < 0 then + k_in, k_out, `Enc_iso88591 + else + scan (k_in + 1) (k_out + n) (c_out + 1) + end + else + k_in, k_out, `Enc_iso88591 + in + scan 0 0 0 +;; + + +let read_usascii write s_in p_in l_in = + let rec scan k_in k_out c_out = + if k_in < l_in then begin + let p = Char.code s_in.[p_in + k_in] in + if p >= 0x80 then raise Malformed_code; + let n = write p k_out c_out in + if n < 0 then + k_in, k_out, `Enc_usascii + else + scan (k_in + 1) (k_out + n) (c_out + 1) + end + else + k_in, k_out, `Enc_usascii + in + scan 0 0 0 +;; + + +let read_8bit m_to_unicode enc write s_in p_in l_in = + let rec scan k_in k_out c_out = + if k_in < l_in then begin + let p_local = Char.code s_in.[p_in + k_in] in + let p_uni = Array.unsafe_get m_to_unicode p_local in + if p_uni < 0 then raise Malformed_code; + let n = write p_uni k_out c_out in + if n < 0 then + k_in, k_out, enc + else + scan (k_in + 1) (k_out + n) (c_out + 1) + end + else + k_in, k_out, enc + in + scan 0 0 0 +;; + + +let read_utf8 is_java write s_in p_in l_in = + let rec scan k_in k_out c_out = + if k_in < l_in then begin + let n_out, n_in = + match s_in.[p_in + k_in] with + '\000' -> + if is_java then raise Malformed_code; + write 0 k_out c_out, 1 + | ('\001'..'\127' as c) -> + write (Char.code c) k_out c_out, 1 + | ('\128'..'\223' as c) -> + if k_in + 1 >= l_in then + -1, 0 + else begin + let n1 = Char.code c in + let n2 = Char.code (s_in.[p_in + k_in + 1]) in + if is_java && (n1 = 0x80 && n2 = 0xc0) then + write 0 k_out c_out, 2 + else begin + if n2 < 128 or n2 > 191 then raise Malformed_code; + let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in + if p < 128 then raise Malformed_code; + write p k_out c_out, 2 + end + end + | ('\224'..'\239' as c) -> + if k_in + 2 >= l_in then + -1, 0 + else begin + let n1 = Char.code c in + let n2 = Char.code (s_in.[p_in + k_in + 1]) in + let n3 = Char.code (s_in.[p_in + k_in + 2]) in + if n2 < 128 or n2 > 191 then raise Malformed_code; + if n3 < 128 or n3 > 191 then raise Malformed_code; + let p = + ((n1 land 0b1111) lsl 12) lor + ((n2 land 0b111111) lsl 6) lor + (n3 land 0b111111) + in + if p < 0x800 then raise Malformed_code; + if (p >= 0xd800 && p < 0xe000) then + (* Surrogate pairs are not supported in UTF-8 *) + raise Malformed_code; + if (p >= 0xfffe && p <= 0xffff) then + raise Malformed_code; + write p k_out c_out, 3 + end + | ('\240'..'\247' as c) -> + if k_in + 3 >= l_in then + -1, 0 + else begin + let n1 = Char.code c in + let n2 = Char.code (s_in.[p_in + k_in + 1]) in + let n3 = Char.code (s_in.[p_in + k_in + 2]) in + let n4 = Char.code (s_in.[p_in + k_in + 3]) in + if n2 < 128 or n2 > 191 then raise Malformed_code; + if n3 < 128 or n3 > 191 then raise Malformed_code; + if n4 < 128 or n4 > 191 then raise Malformed_code; + let p = ((n1 land 0b111) lsl 18) lor + ((n2 land 0b111111) lsl 12) lor + ((n3 land 0b111111) lsl 6) lor + (n4 land 0b111111) + in + if p < 0x10000 then raise Malformed_code; + if p >= 0x110000 then + (* These code points are not supported. *) + raise Malformed_code; + write p k_out c_out, 4 + end + | _ -> + (* Outside the valid range of XML characters *) + raise Malformed_code; + in + (* n_out: number of written bytes; -1 means out buf is full + * n_in: number of read bytes; 0 means end of in buf reached + * n_in = 0 implies n_out = -1 + *) + if n_out < 0 then + k_in, k_out, `Enc_utf8 + else + scan (k_in + n_in) (k_out + n_out) (c_out + 1) + end + else + k_in, k_out, `Enc_utf8 + in + scan 0 0 0 +;; + + +let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00;; + +let read_utf16_le k_in_0 write s_in p_in l_in = + let rec scan k_in k_out c_out = + if k_in + 1 < l_in then begin + let p = (Char.code s_in.[p_in + k_in]) lor ((Char.code s_in.[p_in + k_in + 1]) lsl 8) in + + if p >= 0xd800 & p < 0xe000 then begin + (* This is a surrogate pair. *) + if k_in + 3 < l_in then begin + if p <= 0xdbff then begin + let q = (Char.code s_in.[p_in + k_in + 2 ]) lor + ((Char.code s_in.[p_in + k_in + 3]) lsl 8) in + if q < 0xdc00 or q > 0xdfff then raise Malformed_code; + let eff_p = (p lsl 10) + q + surrogate_offset in + let n = write eff_p k_out c_out in + if n < 0 then + k_in, k_out, `Enc_utf16_le + else + scan (k_in + 4) (k_out + n) (c_out + 1) + end + else + (* Malformed pair: *) + raise Malformed_code; + end + else + (* Incomplete pair: *) + k_in, k_out, `Enc_utf16_le + end + + else + if p = 0xfffe then + (* Big endian byte order mark: It is illegal here *) + raise Malformed_code + else begin + (* A regular code point *) + let n = write p k_out c_out in + if n < 0 then + k_in, k_out, `Enc_utf16_le + else + scan (k_in + 2) (k_out + n) (c_out + 1) + end + end + else + (* Incomplete character: *) + k_in, k_out, `Enc_utf16_le + in + scan k_in_0 0 0 +;; + + +let read_utf16_be k_in_0 write s_in p_in l_in = + let rec scan k_in k_out c_out = + if k_in + 1 < l_in then begin + let p = (Char.code s_in.[p_in + k_in + 1]) lor ((Char.code s_in.[p_in + k_in]) lsl 8) in + + if p >= 0xd800 & p < 0xe000 then begin + (* This is a surrogate pair. *) + if k_in + 3 < l_in then begin + if p <= 0xdbff then begin + let q = (Char.code s_in.[p_in + k_in + 3 ]) lor + ((Char.code s_in.[p_in + k_in + 2]) lsl 8) in + if q < 0xdc00 or q > 0xdfff then raise Malformed_code; + let eff_p = (p lsl 10) + q + surrogate_offset in + let n = write eff_p k_out c_out in + if n < 0 then + k_in, k_out, `Enc_utf16_be + else + scan (k_in + 4) (k_out + n) (c_out + 1) + end + else + (* Malformed pair: *) + raise Malformed_code; + end + else + (* Incomplete pair: *) + k_in, k_out, `Enc_utf16_be + end + + else + if p = 0xfffe then + (* Little endian byte order mark: It is illegal here *) + raise Malformed_code + else begin + (* A regular code point *) + let n = write p k_out c_out in + if n < 0 then + k_in, k_out, `Enc_utf16_be + else + scan (k_in + 2) (k_out + n) (c_out + 1) + end + + end + else + (* Incomplete character: *) + k_in, k_out, `Enc_utf16_be + in + scan k_in_0 0 0 +;; + + +let read_utf16 write s_in p_in l_in = + (* Expect a BOM at the beginning of the text *) + if l_in >= 2 then begin + let c0 = s_in.[p_in + 0] in + let c1 = s_in.[p_in + 1] in + if c0 = '\254' & c1 = '\255' then begin + (* 0xfeff as big endian *) + read_utf16_be 2 write s_in p_in l_in + end + else + if c0 = '\255' & c1 = '\254' then begin + (* 0xfeff as little endian *) + read_utf16_le 2 write s_in p_in l_in + end + else + (* byte order mark missing *) + raise Malformed_code + end + else + 0, 0, `Enc_utf16 +;; + + +let write_iso88591 s_out p_out l_out max_chars w p k_out c_out = + if k_out < l_out && c_out < max_chars then begin + if p > 255 then begin + let subst = w p in + let l_subst = String.length subst in + if k_out + l_subst <= l_out then begin + (* Enough space to store 'subst': *) + String.blit subst 0 s_out (k_out+p_out) l_subst; + l_subst + end + else + (* Not enough space: Stop this round of recoding *) + -1 + end + else begin + s_out.[p_out + k_out] <- Char.chr p; + 1 + end + end + else + -1 (* End-of-buffer indicator *) +;; + + +let write_usascii s_out p_out l_out max_chars w p k_out c_out = + if k_out < l_out && c_out < max_chars then begin + if p > 127 then begin + let subst = w p in + let l_subst = String.length subst in + if k_out + l_subst <= l_out then begin + (* Enough space to store 'subst': *) + String.blit subst 0 s_out (k_out+p_out) l_subst; + l_subst + end + else + (* Not enough space: Stop this round of recoding *) + -1 + end + else begin + s_out.[p_out + k_out] <- Char.chr p; + 1 + end + end + else + -1 (* End-of-buffer indicator *) +;; + + +let write_8bit from_unicode s_out p_out l_out max_chars w p k_out c_out = + if k_out < l_out && c_out < max_chars then begin + let p' = + match Array.unsafe_get from_unicode (p land 255) with + Netmappings.U_nil -> -1 + | Netmappings.U_single (p0,q0) -> + if p0 = p then q0 else -1 + | Netmappings.U_list l -> + (try List.assoc p l with Not_found -> -1) + in + if p' < 0 then begin + let subst = w p in + let l_subst = String.length subst in + if k_out + l_subst <= l_out then begin + (* Enough space to store 'subst': *) + String.blit subst 0 s_out (k_out+p_out) l_subst; + l_subst + end + else + (* Not enough space: Stop this round of recoding *) + -1 + end + else begin + s_out.[p_out + k_out] <- Char.chr p'; + 1 + end + end + else + -1 (* End-of-buffer indicator *) +;; + + +let write_utf8 is_java s_out p_out l_out max_chars w p k_out c_out = + if p <= 127 && (not is_java || p <> 0) then begin + if k_out < l_out && c_out < max_chars then begin + s_out.[p_out + k_out] <- Char.chr p; + 1 + end + else -1 + end + else if p <= 0x7ff then begin + if k_out + 1 < l_out && c_out < max_chars then begin + s_out.[p_out + k_out] <- Char.chr (0xc0 lor (p lsr 6)); + s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor (p land 0x3f)); + 2 + end + else -1 + end + else if p <= 0xffff then begin + (* Refuse writing surrogate pairs, and fffe, ffff *) + if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then + failwith "Netconversion.write_utf8"; + if k_out + 2 < l_out && c_out < max_chars then begin + s_out.[p_out + k_out] <- Char.chr (0xe0 lor (p lsr 12)); + s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); + s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor (p land 0x3f)); + 3 + end + else -1 + end + else if p <= 0x10ffff then begin + if k_out + 3 < l_out && c_out < max_chars then begin + s_out.[p_out + k_out] <- Char.chr (0xf0 lor (p lsr 18)); + s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 12) land 0x3f)); + s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); + s_out.[p_out + k_out + 3] <- Char.chr (0x80 lor (p land 0x3f)); + 4 + end + else -1 + end + else + (* Higher code points are not possible in XML: *) + failwith "Netconversion.write_utf8" +;; + + +let write_utf16_le s_out p_out l_out max_chars w p k_out c_out = + if p >= 0xfffe then begin + if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_le"; + (* Must be written as surrogate pair *) + if k_out + 3 < l_out && c_out < max_chars then begin + let high = (p lsr 10) + 0xd800 in + let low = (p land 0x3ff) + 0xdc00 in + s_out.[p_out + k_out ] <- Char.chr (high land 0xff); + s_out.[p_out + k_out + 1] <- Char.chr (high lsr 8); + s_out.[p_out + k_out + 2] <- Char.chr (low land 0xff); + s_out.[p_out + k_out + 3] <- Char.chr (low lsr 8); + 4 + end + else -1 + end + else begin + if k_out + 1 < l_out && c_out < max_chars then begin + s_out.[p_out + k_out ] <- Char.chr (p land 0xff); + s_out.[p_out + k_out + 1] <- Char.chr (p lsr 8); + 2 + end + else + -1 + end +;; + + +let write_utf16_be s_out p_out l_out max_chars w p k_out c_out = + if p >= 0xfffe then begin + if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_be"; + (* Must be written as surrogate pair *) + if k_out + 3 < l_out && c_out < max_chars then begin + let high = (p lsr 10) + 0xd800 in + let low = (p land 0x3ff) + 0xdc00 in + s_out.[p_out + k_out + 1] <- Char.chr (high land 0xff); + s_out.[p_out + k_out ] <- Char.chr (high lsr 8); + s_out.[p_out + k_out + 3] <- Char.chr (low land 0xff); + s_out.[p_out + k_out + 2] <- Char.chr (low lsr 8); + 4 + end + else -1 + end + else begin + if k_out + 1 < l_out && c_out < max_chars then begin + s_out.[p_out + k_out + 1] <- Char.chr (p land 0xff); + s_out.[p_out + k_out ] <- Char.chr (p lsr 8); + 2 + end + else + -1 + end +;; + + +let recode ~in_enc + ~in_buf + ~in_pos + ~in_len + ~out_enc + ~out_buf + ~out_pos + ~out_len + ~max_chars + ~subst = + if (in_pos < 0 || in_len < 0 || in_pos + in_len > String.length in_buf || + out_pos < 0 || out_len < 0 || out_pos + out_len > String.length out_buf) + then + invalid_arg "Netconversion.recode"; + + let reader = + match in_enc with + `Enc_iso88591 -> read_iso88591 + | `Enc_usascii -> read_usascii + | `Enc_utf8 -> read_utf8 false + | `Enc_java -> read_utf8 true + | `Enc_utf16 -> read_utf16 + | `Enc_utf16_le -> read_utf16_le 0 + | `Enc_utf16_be -> read_utf16_be 0 + | _ -> + (try + let to_unicode' = Hashtbl.find Netmappings.to_unicode in_enc in + let to_unicode = + Netmappings.lock(); + Lazy.force to_unicode' in + Netmappings.unlock(); + read_8bit to_unicode in_enc + with + Not_found -> + failwith("Support for the encoding `" ^ + string_of_encoding in_enc ^ + "' has not been compiled into Netstring") + ) + in + let writer = + match out_enc with + `Enc_iso88591 -> write_iso88591 out_buf out_pos out_len max_chars subst + | `Enc_usascii -> write_usascii out_buf out_pos out_len max_chars subst + | `Enc_utf8 -> write_utf8 false + out_buf out_pos out_len max_chars subst + | `Enc_java -> write_utf8 true out_buf out_pos out_len max_chars subst + | `Enc_utf16 -> failwith "Netconversion.recode" + | `Enc_utf16_le -> write_utf16_le out_buf out_pos out_len max_chars subst + | `Enc_utf16_be -> write_utf16_be out_buf out_pos out_len max_chars subst + | _ -> + (try + let from_unicode' = Hashtbl.find Netmappings.from_unicode out_enc + in + let from_unicode = + Netmappings.lock(); + Lazy.force from_unicode' in + Netmappings.unlock(); + write_8bit from_unicode out_buf out_pos out_len max_chars subst + with + Not_found -> + failwith("Support for the encoding `" ^ + string_of_encoding out_enc ^ + "' has not been compiled into Netstring") + ) + in + reader writer in_buf in_pos in_len +;; + + +let makechar enc p = + match enc with + `Enc_iso88591 -> + if p > 255 then raise Not_found; + String.make 1 (Char.chr p) + | `Enc_usascii -> + if p > 127 then raise Not_found; + String.make 1 (Char.chr p) + | `Enc_utf8 -> + let s = String.create 4 in + let n = write_utf8 false s 0 4 1 (fun _ -> raise Not_found) p 0 0 in + String.sub s 0 n + | `Enc_java -> + let s = String.create 4 in + let n = write_utf8 true s 0 4 1 (fun _ -> raise Not_found) p 0 0 in + String.sub s 0 n + | `Enc_utf16_le -> + let s = String.create 4 in + let n = write_utf16_le s 0 4 1 (fun _ -> raise Not_found) p 0 0 in + String.sub s 0 n + | `Enc_utf16_be -> + let s = String.create 4 in + let n = write_utf16_be s 0 4 1 (fun _ -> raise Not_found) p 0 0 in + String.sub s 0 n + | `Enc_utf16 -> + failwith "Netconversion.makechar" + | _ -> + let s = String.create 1 in + let from_unicode' = + try + Hashtbl.find Netmappings.from_unicode enc + with + Not_found -> + failwith("Support for the encoding `" ^ + string_of_encoding enc ^ + "' has not been compiled into Netstring") + in + let from_unicode = + Netmappings.lock(); + Lazy.force from_unicode' in + Netmappings.unlock(); + let n = + write_8bit from_unicode s 0 1 1 (fun _ -> raise Not_found) p 0 0 in + s +;; + + +let recode_string ~in_enc ~out_enc ?(subst = (fun _ -> raise Not_found)) s = + + let length = String.length s in + let size = 1024 in + let out_buf = String.create size in + + let rec recode_loop k s_done in_enc = + (* 'k' bytes of 's' have already been processed, and the result is in + * 's_done'. + *) + (* Recode to 'out_buf': *) + let in_len = length - k in + let in_done, out_done, in_enc' = + recode ~in_enc:in_enc ~in_buf:s ~in_pos:k ~in_len:in_len + ~out_enc:out_enc ~out_buf:out_buf ~out_pos:0 ~out_len:size + ~max_chars:size ~subst:subst in + (* Collect the results: *) + let k' = k + in_done in + let s_done' = String.sub out_buf 0 out_done :: s_done in + (* Still something to do? *) + if k' < length then + recode_loop k' s_done' in_enc' + else + (* No: Concatenate s_done' to get the final result. *) + String.concat "" (List.rev s_done') + in + + recode_loop 0 [] in_enc +;; + + +(* ====================================================================== + * History: + * + * $Log$ + * Revision 1.1 2000/11/17 09:57:28 lpadovan + * Initial revision + * + * Revision 1.2 2000/08/29 00:46:41 gerd + * New type for the Unicode to 8 bit translation table. + * The Netmappings tables are now Lazy.t. + * + * Revision 1.1 2000/08/13 00:02:57 gerd + * Initial revision. + * + * + * ====================================================================== + * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.ml): + * + * Revision 1.5 2000/07/27 00:41:14 gerd + * new 8 bit codes + * + * Revision 1.4 2000/07/04 22:11:41 gerd + * Implemented the enhancements and extensions of + * rev. 1.4 of pxp_encoding.mli. + * + * Revision 1.3 2000/05/29 23:48:38 gerd + * Changed module names: + * Markup_aux into Pxp_aux + * Markup_codewriter into Pxp_codewriter + * Markup_document into Pxp_document + * Markup_dtd into Pxp_dtd + * Markup_entity into Pxp_entity + * Markup_lexer_types into Pxp_lexer_types + * Markup_reader into Pxp_reader + * Markup_types into Pxp_types + * Markup_yacc into Pxp_yacc + * See directory "compatibility" for (almost) compatible wrappers emulating + * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. + * + * Revision 1.2 2000/05/29 21:14:57 gerd + * Changed the type 'encoding' into a polymorphic variant. + * + * Revision 1.1 2000/05/20 20:30:50 gerd + * Initial revision. + * + * + *)