X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnetconversion.ml;fp=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnetconversion.ml;h=0000000000000000000000000000000000000000;hb=3ef089a4c58fbe429dd539af6215991ecbe11ee2;hp=e740654ade9805b86902a499bc6a2838f304f801;hpb=1c7fb836e2af4f2f3d18afd0396701f2094265ff;p=helm.git diff --git a/helm/DEVEL/pxp/netstring/netconversion.ml b/helm/DEVEL/pxp/netstring/netconversion.ml deleted file mode 100644 index e740654ad..000000000 --- a/helm/DEVEL/pxp/netstring/netconversion.ml +++ /dev/null @@ -1,864 +0,0 @@ -(* $Id$ - * ---------------------------------------------------------------------- - *) - -exception Malformed_code - - -type encoding = - [ `Enc_utf8 (* UTF-8 *) - | `Enc_java - | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *) - | `Enc_utf16_le (* UTF-16 little endian *) - | `Enc_utf16_be (* UTF-16 big endian *) - | `Enc_usascii (* US-ASCII (only 7 bit) *) - | `Enc_iso88591 (* ISO-8859-1 *) - | `Enc_iso88592 (* ISO-8859-2 *) - | `Enc_iso88593 (* ISO-8859-3 *) - | `Enc_iso88594 (* ISO-8859-4 *) - | `Enc_iso88595 (* ISO-8859-5 *) - | `Enc_iso88596 (* ISO-8859-6 *) - | `Enc_iso88597 (* ISO-8859-7 *) - | `Enc_iso88598 (* ISO-8859-8 *) - | `Enc_iso88599 (* ISO-8859-9 *) - | `Enc_iso885910 (* ISO-8859-10 *) - | `Enc_iso885913 (* ISO-8859-13 *) - | `Enc_iso885914 (* ISO-8859-14 *) - | `Enc_iso885915 (* ISO-8859-15 *) - | `Enc_koi8r (* KOI8-R *) - | `Enc_jis0201 (* JIS-0201 *) - (* Microsoft: *) - | `Enc_windows1250 (* WINDOWS-1250 *) - | `Enc_windows1251 (* WINDOWS-1251 *) - | `Enc_windows1252 (* WINDOWS-1252 *) - | `Enc_windows1253 (* WINDOWS-1253 *) - | `Enc_windows1254 (* WINDOWS-1254 *) - | `Enc_windows1255 (* WINDOWS-1255 *) - | `Enc_windows1256 (* WINDOWS-1256 *) - | `Enc_windows1257 (* WINDOWS-1257 *) - | `Enc_windows1258 (* WINDOWS-1258 *) - (* IBM, ASCII-based: *) - | `Enc_cp437 - | `Enc_cp737 - | `Enc_cp775 - | `Enc_cp850 - | `Enc_cp852 - | `Enc_cp855 - | `Enc_cp856 - | `Enc_cp857 - | `Enc_cp860 - | `Enc_cp861 - | `Enc_cp862 - | `Enc_cp863 - | `Enc_cp864 - | `Enc_cp865 - | `Enc_cp866 - | `Enc_cp869 - | `Enc_cp874 - | `Enc_cp1006 - (* IBM, EBCDIC-based: *) - | `Enc_cp037 - | `Enc_cp424 - | `Enc_cp500 - | `Enc_cp875 - | `Enc_cp1026 - (* Adobe: *) - | `Enc_adobe_standard_encoding - | `Enc_adobe_symbol_encoding - | `Enc_adobe_zapf_dingbats_encoding - (* Apple: *) - | `Enc_macroman - - ] -;; - - -let norm_enc_name e = - (* Removes some characters from e; uppercase *) - let e' = String.create (String.length e) in - let rec next i j = - if i < String.length e then - match e.[i] with - ('-'|'_'|'.') -> next (i+1) j - | c -> e'.[j] <- c; next (i+1) (j+1) - else - j - in - let l = next 0 0 in - String.uppercase(String.sub e' 0 l) -;; - - -let encoding_of_string e = - match norm_enc_name e with - ("UTF16"|"UCS2"|"ISO10646UCS2") -> `Enc_utf16 - | "UTF16BE" -> `Enc_utf16_be - | "UTF16LE" -> `Enc_utf16_le - | "UTF8" -> `Enc_utf8 - | ("UTF8JAVA"|"JAVA") -> `Enc_java - | ("USASCII"|"ASCII"|"ISO646US"|"IBM367"|"CP367") -> `Enc_usascii - | ("ISO88591"|"LATIN1"|"IBM819"|"CP819") -> `Enc_iso88591 - | ("ISO88592"|"LATIN2") -> `Enc_iso88592 - | ("ISO88593"|"LATIN3") -> `Enc_iso88593 - | ("ISO88594"|"LATIN4") -> `Enc_iso88594 - | ("ISO88595"|"CYRILLIC") -> `Enc_iso88595 - | ("ISO88596"|"ARABIC"|"ECMA114"|"ASMO708") -> `Enc_iso88596 - | ("ISO88597"|"GREEK"|"GREEK8"|"ELOT928"|"ECMA118") -> `Enc_iso88597 - | ("ISO88598"|"HEBREW") -> `Enc_iso88598 - | ("ISO88599"|"LATIN5") -> `Enc_iso88599 - | ("ISO885910"|"LATIN6") -> `Enc_iso885910 - | "ISO885913" -> `Enc_iso885913 - | "ISO885914" -> `Enc_iso885914 - | "ISO885915" -> `Enc_iso885915 - | "KOI8R" -> `Enc_koi8r - | ("JIS0201"|"JISX0201"|"X0201") -> `Enc_jis0201 - - | "WINDOWS1250" -> `Enc_windows1250 - | "WINDOWS1251" -> `Enc_windows1251 - | "WINDOWS1252" -> `Enc_windows1252 - | "WINDOWS1253" -> `Enc_windows1253 - | "WINDOWS1254" -> `Enc_windows1254 - | "WINDOWS1255" -> `Enc_windows1255 - | "WINDOWS1256" -> `Enc_windows1256 - | "WINDOWS1257" -> `Enc_windows1257 - | "WINDOWS1258" -> `Enc_windows1258 - - | ("CP437"|"IBM437") -> `Enc_cp437 - | ("CP737"|"IBM737") -> `Enc_cp737 - | ("CP775"|"IBM775") -> `Enc_cp775 - | ("CP850"|"IBM850") -> `Enc_cp850 - | ("CP852"|"IBM852") -> `Enc_cp852 - | ("CP855"|"IBM855") -> `Enc_cp855 - | ("CP856"|"IBM856") -> `Enc_cp856 - | ("CP857"|"IBM857") -> `Enc_cp857 - | ("CP860"|"IBM860") -> `Enc_cp860 - | ("CP861"|"IBM861") -> `Enc_cp861 - | ("CP862"|"IBM862") -> `Enc_cp862 - | ("CP863"|"IBM863") -> `Enc_cp863 - | ("CP864"|"IBM864") -> `Enc_cp864 - | ("CP865"|"IBM865") -> `Enc_cp865 - | ("CP866"|"IBM866") -> `Enc_cp866 - | ("CP869"|"IBM869") -> `Enc_cp869 - | ("CP874"|"IBM874") -> `Enc_cp874 - | ("CP1006"|"IBM1006") -> `Enc_cp1006 - - | ("CP037"|"IBM037"|"EBCDICCPUS"|"EBCDICCPCA"|"EBCDICCPWT"| - "EBCDICCPNL") -> `Enc_cp037 - | ("CP424"|"IBM424"|"EBCDICCPHE") -> `Enc_cp424 - | ("CP500"|"IBM500"|"EBCDICCPBE"|"EBCDICCPCH") -> `Enc_cp500 - | ("CP875"|"IBM875") -> `Enc_cp875 - | ("CP1026"|"IBM1026") -> `Enc_cp1026 - - | "ADOBESTANDARDENCODING" -> `Enc_adobe_standard_encoding - | "ADOBESYMBOLENCODING" -> `Enc_adobe_symbol_encoding - | "ADOBEZAPFDINGBATSENCODING" -> `Enc_adobe_zapf_dingbats_encoding - - | "MACINTOSH" -> `Enc_macroman - - | _ -> - failwith "Netconversion.encoding_of_string: unknown encoding" -;; - - -let string_of_encoding (e : encoding) = - (* If there is a "preferred MIME name", this name is returned (see IANA). *) - match e with - `Enc_utf16 -> "UTF-16" - | `Enc_utf16_be -> "UTF-16-BE" - | `Enc_utf16_le -> "UTF-16-LE" - | `Enc_utf8 -> "UTF-8" - | `Enc_java -> "UTF-8-JAVA" - | `Enc_usascii -> "US-ASCII" - | `Enc_iso88591 -> "ISO-8859-1" - | `Enc_iso88592 -> "ISO-8859-2" - | `Enc_iso88593 -> "ISO-8859-3" - | `Enc_iso88594 -> "ISO-8859-4" - | `Enc_iso88595 -> "ISO-8859-5" - | `Enc_iso88596 -> "ISO-8859-6" - | `Enc_iso88597 -> "ISO-8859-7" - | `Enc_iso88598 -> "ISO-8859-8" - | `Enc_iso88599 -> "ISO-8859-9" - | `Enc_iso885910 -> "ISO-8859-10" - | `Enc_iso885913 -> "ISO-8859-13" - | `Enc_iso885914 -> "ISO-8859-14" - | `Enc_iso885915 -> "ISO-8859-15" - | `Enc_koi8r -> "KOI8-R" - | `Enc_jis0201 -> "JIS_X0201" - | `Enc_windows1250 -> "WINDOWS-1250" - | `Enc_windows1251 -> "WINDOWS-1251" - | `Enc_windows1252 -> "WINDOWS-1252" - | `Enc_windows1253 -> "WINDOWS-1253" - | `Enc_windows1254 -> "WINDOWS-1254" - | `Enc_windows1255 -> "WINDOWS-1255" - | `Enc_windows1256 -> "WINDOWS-1256" - | `Enc_windows1257 -> "WINDOWS-1257" - | `Enc_windows1258 -> "WINDOWS-1258" - | `Enc_cp437 -> "CP437" - | `Enc_cp737 -> "CP737" - | `Enc_cp775 -> "CP775" - | `Enc_cp850 -> "CP850" - | `Enc_cp852 -> "CP852" - | `Enc_cp855 -> "CP855" - | `Enc_cp856 -> "CP856" - | `Enc_cp857 -> "CP857" - | `Enc_cp860 -> "CP860" - | `Enc_cp861 -> "CP861" - | `Enc_cp862 -> "CP862" - | `Enc_cp863 -> "CP863" - | `Enc_cp864 -> "CP864" - | `Enc_cp865 -> "CP865" - | `Enc_cp866 -> "CP866" - | `Enc_cp869 -> "CP869" - | `Enc_cp874 -> "CP874" - | `Enc_cp1006 -> "CP1006" - | `Enc_cp037 -> "CP037" - | `Enc_cp424 -> "CP424" - | `Enc_cp500 -> "CP500" - | `Enc_cp875 -> "CP875" - | `Enc_cp1026 -> "CP1026" - | `Enc_adobe_standard_encoding -> "ADOBE-STANDARD-ENCODING" - | `Enc_adobe_symbol_encoding -> "ADOBE-SYMBOL-ENCODING" - | `Enc_adobe_zapf_dingbats_encoding -> "ADOBE-ZAPF-DINGBATS-ENCODING" - | `Enc_macroman -> "MACINTOSH" -;; - - -let read_iso88591 write s_in p_in l_in = - let rec scan k_in k_out c_out = - if k_in < l_in then begin - let p = Char.code s_in.[p_in + k_in] in - let n = write p k_out c_out in - if n < 0 then - k_in, k_out, `Enc_iso88591 - else - scan (k_in + 1) (k_out + n) (c_out + 1) - end - else - k_in, k_out, `Enc_iso88591 - in - scan 0 0 0 -;; - - -let read_usascii write s_in p_in l_in = - let rec scan k_in k_out c_out = - if k_in < l_in then begin - let p = Char.code s_in.[p_in + k_in] in - if p >= 0x80 then raise Malformed_code; - let n = write p k_out c_out in - if n < 0 then - k_in, k_out, `Enc_usascii - else - scan (k_in + 1) (k_out + n) (c_out + 1) - end - else - k_in, k_out, `Enc_usascii - in - scan 0 0 0 -;; - - -let read_8bit m_to_unicode enc write s_in p_in l_in = - let rec scan k_in k_out c_out = - if k_in < l_in then begin - let p_local = Char.code s_in.[p_in + k_in] in - let p_uni = Array.unsafe_get m_to_unicode p_local in - if p_uni < 0 then raise Malformed_code; - let n = write p_uni k_out c_out in - if n < 0 then - k_in, k_out, enc - else - scan (k_in + 1) (k_out + n) (c_out + 1) - end - else - k_in, k_out, enc - in - scan 0 0 0 -;; - - -let read_utf8 is_java write s_in p_in l_in = - let rec scan k_in k_out c_out = - if k_in < l_in then begin - let n_out, n_in = - match s_in.[p_in + k_in] with - '\000' -> - if is_java then raise Malformed_code; - write 0 k_out c_out, 1 - | ('\001'..'\127' as c) -> - write (Char.code c) k_out c_out, 1 - | ('\128'..'\223' as c) -> - if k_in + 1 >= l_in then - -1, 0 - else begin - let n1 = Char.code c in - let n2 = Char.code (s_in.[p_in + k_in + 1]) in - if is_java && (n1 = 0x80 && n2 = 0xc0) then - write 0 k_out c_out, 2 - else begin - if n2 < 128 or n2 > 191 then raise Malformed_code; - let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in - if p < 128 then raise Malformed_code; - write p k_out c_out, 2 - end - end - | ('\224'..'\239' as c) -> - if k_in + 2 >= l_in then - -1, 0 - else begin - let n1 = Char.code c in - let n2 = Char.code (s_in.[p_in + k_in + 1]) in - let n3 = Char.code (s_in.[p_in + k_in + 2]) in - if n2 < 128 or n2 > 191 then raise Malformed_code; - if n3 < 128 or n3 > 191 then raise Malformed_code; - let p = - ((n1 land 0b1111) lsl 12) lor - ((n2 land 0b111111) lsl 6) lor - (n3 land 0b111111) - in - if p < 0x800 then raise Malformed_code; - if (p >= 0xd800 && p < 0xe000) then - (* Surrogate pairs are not supported in UTF-8 *) - raise Malformed_code; - if (p >= 0xfffe && p <= 0xffff) then - raise Malformed_code; - write p k_out c_out, 3 - end - | ('\240'..'\247' as c) -> - if k_in + 3 >= l_in then - -1, 0 - else begin - let n1 = Char.code c in - let n2 = Char.code (s_in.[p_in + k_in + 1]) in - let n3 = Char.code (s_in.[p_in + k_in + 2]) in - let n4 = Char.code (s_in.[p_in + k_in + 3]) in - if n2 < 128 or n2 > 191 then raise Malformed_code; - if n3 < 128 or n3 > 191 then raise Malformed_code; - if n4 < 128 or n4 > 191 then raise Malformed_code; - let p = ((n1 land 0b111) lsl 18) lor - ((n2 land 0b111111) lsl 12) lor - ((n3 land 0b111111) lsl 6) lor - (n4 land 0b111111) - in - if p < 0x10000 then raise Malformed_code; - if p >= 0x110000 then - (* These code points are not supported. *) - raise Malformed_code; - write p k_out c_out, 4 - end - | _ -> - (* Outside the valid range of XML characters *) - raise Malformed_code; - in - (* n_out: number of written bytes; -1 means out buf is full - * n_in: number of read bytes; 0 means end of in buf reached - * n_in = 0 implies n_out = -1 - *) - if n_out < 0 then - k_in, k_out, `Enc_utf8 - else - scan (k_in + n_in) (k_out + n_out) (c_out + 1) - end - else - k_in, k_out, `Enc_utf8 - in - scan 0 0 0 -;; - - -let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00;; - -let read_utf16_le k_in_0 write s_in p_in l_in = - let rec scan k_in k_out c_out = - if k_in + 1 < l_in then begin - let p = (Char.code s_in.[p_in + k_in]) lor ((Char.code s_in.[p_in + k_in + 1]) lsl 8) in - - if p >= 0xd800 & p < 0xe000 then begin - (* This is a surrogate pair. *) - if k_in + 3 < l_in then begin - if p <= 0xdbff then begin - let q = (Char.code s_in.[p_in + k_in + 2 ]) lor - ((Char.code s_in.[p_in + k_in + 3]) lsl 8) in - if q < 0xdc00 or q > 0xdfff then raise Malformed_code; - let eff_p = (p lsl 10) + q + surrogate_offset in - let n = write eff_p k_out c_out in - if n < 0 then - k_in, k_out, `Enc_utf16_le - else - scan (k_in + 4) (k_out + n) (c_out + 1) - end - else - (* Malformed pair: *) - raise Malformed_code; - end - else - (* Incomplete pair: *) - k_in, k_out, `Enc_utf16_le - end - - else - if p = 0xfffe then - (* Big endian byte order mark: It is illegal here *) - raise Malformed_code - else begin - (* A regular code point *) - let n = write p k_out c_out in - if n < 0 then - k_in, k_out, `Enc_utf16_le - else - scan (k_in + 2) (k_out + n) (c_out + 1) - end - end - else - (* Incomplete character: *) - k_in, k_out, `Enc_utf16_le - in - scan k_in_0 0 0 -;; - - -let read_utf16_be k_in_0 write s_in p_in l_in = - let rec scan k_in k_out c_out = - if k_in + 1 < l_in then begin - let p = (Char.code s_in.[p_in + k_in + 1]) lor ((Char.code s_in.[p_in + k_in]) lsl 8) in - - if p >= 0xd800 & p < 0xe000 then begin - (* This is a surrogate pair. *) - if k_in + 3 < l_in then begin - if p <= 0xdbff then begin - let q = (Char.code s_in.[p_in + k_in + 3 ]) lor - ((Char.code s_in.[p_in + k_in + 2]) lsl 8) in - if q < 0xdc00 or q > 0xdfff then raise Malformed_code; - let eff_p = (p lsl 10) + q + surrogate_offset in - let n = write eff_p k_out c_out in - if n < 0 then - k_in, k_out, `Enc_utf16_be - else - scan (k_in + 4) (k_out + n) (c_out + 1) - end - else - (* Malformed pair: *) - raise Malformed_code; - end - else - (* Incomplete pair: *) - k_in, k_out, `Enc_utf16_be - end - - else - if p = 0xfffe then - (* Little endian byte order mark: It is illegal here *) - raise Malformed_code - else begin - (* A regular code point *) - let n = write p k_out c_out in - if n < 0 then - k_in, k_out, `Enc_utf16_be - else - scan (k_in + 2) (k_out + n) (c_out + 1) - end - - end - else - (* Incomplete character: *) - k_in, k_out, `Enc_utf16_be - in - scan k_in_0 0 0 -;; - - -let read_utf16 write s_in p_in l_in = - (* Expect a BOM at the beginning of the text *) - if l_in >= 2 then begin - let c0 = s_in.[p_in + 0] in - let c1 = s_in.[p_in + 1] in - if c0 = '\254' & c1 = '\255' then begin - (* 0xfeff as big endian *) - read_utf16_be 2 write s_in p_in l_in - end - else - if c0 = '\255' & c1 = '\254' then begin - (* 0xfeff as little endian *) - read_utf16_le 2 write s_in p_in l_in - end - else - (* byte order mark missing *) - raise Malformed_code - end - else - 0, 0, `Enc_utf16 -;; - - -let write_iso88591 s_out p_out l_out max_chars w p k_out c_out = - if k_out < l_out && c_out < max_chars then begin - if p > 255 then begin - let subst = w p in - let l_subst = String.length subst in - if k_out + l_subst <= l_out then begin - (* Enough space to store 'subst': *) - String.blit subst 0 s_out (k_out+p_out) l_subst; - l_subst - end - else - (* Not enough space: Stop this round of recoding *) - -1 - end - else begin - s_out.[p_out + k_out] <- Char.chr p; - 1 - end - end - else - -1 (* End-of-buffer indicator *) -;; - - -let write_usascii s_out p_out l_out max_chars w p k_out c_out = - if k_out < l_out && c_out < max_chars then begin - if p > 127 then begin - let subst = w p in - let l_subst = String.length subst in - if k_out + l_subst <= l_out then begin - (* Enough space to store 'subst': *) - String.blit subst 0 s_out (k_out+p_out) l_subst; - l_subst - end - else - (* Not enough space: Stop this round of recoding *) - -1 - end - else begin - s_out.[p_out + k_out] <- Char.chr p; - 1 - end - end - else - -1 (* End-of-buffer indicator *) -;; - - -let write_8bit from_unicode s_out p_out l_out max_chars w p k_out c_out = - if k_out < l_out && c_out < max_chars then begin - let p' = - match Array.unsafe_get from_unicode (p land 255) with - Netmappings.U_nil -> -1 - | Netmappings.U_single (p0,q0) -> - if p0 = p then q0 else -1 - | Netmappings.U_list l -> - (try List.assoc p l with Not_found -> -1) - in - if p' < 0 then begin - let subst = w p in - let l_subst = String.length subst in - if k_out + l_subst <= l_out then begin - (* Enough space to store 'subst': *) - String.blit subst 0 s_out (k_out+p_out) l_subst; - l_subst - end - else - (* Not enough space: Stop this round of recoding *) - -1 - end - else begin - s_out.[p_out + k_out] <- Char.chr p'; - 1 - end - end - else - -1 (* End-of-buffer indicator *) -;; - - -let write_utf8 is_java s_out p_out l_out max_chars w p k_out c_out = - if p <= 127 && (not is_java || p <> 0) then begin - if k_out < l_out && c_out < max_chars then begin - s_out.[p_out + k_out] <- Char.chr p; - 1 - end - else -1 - end - else if p <= 0x7ff then begin - if k_out + 1 < l_out && c_out < max_chars then begin - s_out.[p_out + k_out] <- Char.chr (0xc0 lor (p lsr 6)); - s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor (p land 0x3f)); - 2 - end - else -1 - end - else if p <= 0xffff then begin - (* Refuse writing surrogate pairs, and fffe, ffff *) - if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then - failwith "Netconversion.write_utf8"; - if k_out + 2 < l_out && c_out < max_chars then begin - s_out.[p_out + k_out] <- Char.chr (0xe0 lor (p lsr 12)); - s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); - s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor (p land 0x3f)); - 3 - end - else -1 - end - else if p <= 0x10ffff then begin - if k_out + 3 < l_out && c_out < max_chars then begin - s_out.[p_out + k_out] <- Char.chr (0xf0 lor (p lsr 18)); - s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 12) land 0x3f)); - s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); - s_out.[p_out + k_out + 3] <- Char.chr (0x80 lor (p land 0x3f)); - 4 - end - else -1 - end - else - (* Higher code points are not possible in XML: *) - failwith "Netconversion.write_utf8" -;; - - -let write_utf16_le s_out p_out l_out max_chars w p k_out c_out = - if p >= 0xfffe then begin - if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_le"; - (* Must be written as surrogate pair *) - if k_out + 3 < l_out && c_out < max_chars then begin - let high = (p lsr 10) + 0xd800 in - let low = (p land 0x3ff) + 0xdc00 in - s_out.[p_out + k_out ] <- Char.chr (high land 0xff); - s_out.[p_out + k_out + 1] <- Char.chr (high lsr 8); - s_out.[p_out + k_out + 2] <- Char.chr (low land 0xff); - s_out.[p_out + k_out + 3] <- Char.chr (low lsr 8); - 4 - end - else -1 - end - else begin - if k_out + 1 < l_out && c_out < max_chars then begin - s_out.[p_out + k_out ] <- Char.chr (p land 0xff); - s_out.[p_out + k_out + 1] <- Char.chr (p lsr 8); - 2 - end - else - -1 - end -;; - - -let write_utf16_be s_out p_out l_out max_chars w p k_out c_out = - if p >= 0xfffe then begin - if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_be"; - (* Must be written as surrogate pair *) - if k_out + 3 < l_out && c_out < max_chars then begin - let high = (p lsr 10) + 0xd800 in - let low = (p land 0x3ff) + 0xdc00 in - s_out.[p_out + k_out + 1] <- Char.chr (high land 0xff); - s_out.[p_out + k_out ] <- Char.chr (high lsr 8); - s_out.[p_out + k_out + 3] <- Char.chr (low land 0xff); - s_out.[p_out + k_out + 2] <- Char.chr (low lsr 8); - 4 - end - else -1 - end - else begin - if k_out + 1 < l_out && c_out < max_chars then begin - s_out.[p_out + k_out + 1] <- Char.chr (p land 0xff); - s_out.[p_out + k_out ] <- Char.chr (p lsr 8); - 2 - end - else - -1 - end -;; - - -let recode ~in_enc - ~in_buf - ~in_pos - ~in_len - ~out_enc - ~out_buf - ~out_pos - ~out_len - ~max_chars - ~subst = - if (in_pos < 0 || in_len < 0 || in_pos + in_len > String.length in_buf || - out_pos < 0 || out_len < 0 || out_pos + out_len > String.length out_buf) - then - invalid_arg "Netconversion.recode"; - - let reader = - match in_enc with - `Enc_iso88591 -> read_iso88591 - | `Enc_usascii -> read_usascii - | `Enc_utf8 -> read_utf8 false - | `Enc_java -> read_utf8 true - | `Enc_utf16 -> read_utf16 - | `Enc_utf16_le -> read_utf16_le 0 - | `Enc_utf16_be -> read_utf16_be 0 - | _ -> - (try - let to_unicode' = Hashtbl.find Netmappings.to_unicode in_enc in - let to_unicode = - Netmappings.lock(); - Lazy.force to_unicode' in - Netmappings.unlock(); - read_8bit to_unicode in_enc - with - Not_found -> - failwith("Support for the encoding `" ^ - string_of_encoding in_enc ^ - "' has not been compiled into Netstring") - ) - in - let writer = - match out_enc with - `Enc_iso88591 -> write_iso88591 out_buf out_pos out_len max_chars subst - | `Enc_usascii -> write_usascii out_buf out_pos out_len max_chars subst - | `Enc_utf8 -> write_utf8 false - out_buf out_pos out_len max_chars subst - | `Enc_java -> write_utf8 true out_buf out_pos out_len max_chars subst - | `Enc_utf16 -> failwith "Netconversion.recode" - | `Enc_utf16_le -> write_utf16_le out_buf out_pos out_len max_chars subst - | `Enc_utf16_be -> write_utf16_be out_buf out_pos out_len max_chars subst - | _ -> - (try - let from_unicode' = Hashtbl.find Netmappings.from_unicode out_enc - in - let from_unicode = - Netmappings.lock(); - Lazy.force from_unicode' in - Netmappings.unlock(); - write_8bit from_unicode out_buf out_pos out_len max_chars subst - with - Not_found -> - failwith("Support for the encoding `" ^ - string_of_encoding out_enc ^ - "' has not been compiled into Netstring") - ) - in - reader writer in_buf in_pos in_len -;; - - -let makechar enc p = - match enc with - `Enc_iso88591 -> - if p > 255 then raise Not_found; - String.make 1 (Char.chr p) - | `Enc_usascii -> - if p > 127 then raise Not_found; - String.make 1 (Char.chr p) - | `Enc_utf8 -> - let s = String.create 4 in - let n = write_utf8 false s 0 4 1 (fun _ -> raise Not_found) p 0 0 in - String.sub s 0 n - | `Enc_java -> - let s = String.create 4 in - let n = write_utf8 true s 0 4 1 (fun _ -> raise Not_found) p 0 0 in - String.sub s 0 n - | `Enc_utf16_le -> - let s = String.create 4 in - let n = write_utf16_le s 0 4 1 (fun _ -> raise Not_found) p 0 0 in - String.sub s 0 n - | `Enc_utf16_be -> - let s = String.create 4 in - let n = write_utf16_be s 0 4 1 (fun _ -> raise Not_found) p 0 0 in - String.sub s 0 n - | `Enc_utf16 -> - failwith "Netconversion.makechar" - | _ -> - let s = String.create 1 in - let from_unicode' = - try - Hashtbl.find Netmappings.from_unicode enc - with - Not_found -> - failwith("Support for the encoding `" ^ - string_of_encoding enc ^ - "' has not been compiled into Netstring") - in - let from_unicode = - Netmappings.lock(); - Lazy.force from_unicode' in - Netmappings.unlock(); - let n = - write_8bit from_unicode s 0 1 1 (fun _ -> raise Not_found) p 0 0 in - s -;; - - -let recode_string ~in_enc ~out_enc ?(subst = (fun _ -> raise Not_found)) s = - - let length = String.length s in - let size = 1024 in - let out_buf = String.create size in - - let rec recode_loop k s_done in_enc = - (* 'k' bytes of 's' have already been processed, and the result is in - * 's_done'. - *) - (* Recode to 'out_buf': *) - let in_len = length - k in - let in_done, out_done, in_enc' = - recode ~in_enc:in_enc ~in_buf:s ~in_pos:k ~in_len:in_len - ~out_enc:out_enc ~out_buf:out_buf ~out_pos:0 ~out_len:size - ~max_chars:size ~subst:subst in - (* Collect the results: *) - let k' = k + in_done in - let s_done' = String.sub out_buf 0 out_done :: s_done in - (* Still something to do? *) - if k' < length then - recode_loop k' s_done' in_enc' - else - (* No: Concatenate s_done' to get the final result. *) - String.concat "" (List.rev s_done') - in - - recode_loop 0 [] in_enc -;; - - -(* ====================================================================== - * History: - * - * $Log$ - * Revision 1.1 2000/11/17 09:57:28 lpadovan - * Initial revision - * - * Revision 1.2 2000/08/29 00:46:41 gerd - * New type for the Unicode to 8 bit translation table. - * The Netmappings tables are now Lazy.t. - * - * Revision 1.1 2000/08/13 00:02:57 gerd - * Initial revision. - * - * - * ====================================================================== - * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.ml): - * - * Revision 1.5 2000/07/27 00:41:14 gerd - * new 8 bit codes - * - * Revision 1.4 2000/07/04 22:11:41 gerd - * Implemented the enhancements and extensions of - * rev. 1.4 of pxp_encoding.mli. - * - * Revision 1.3 2000/05/29 23:48:38 gerd - * Changed module names: - * Markup_aux into Pxp_aux - * Markup_codewriter into Pxp_codewriter - * Markup_document into Pxp_document - * Markup_dtd into Pxp_dtd - * Markup_entity into Pxp_entity - * Markup_lexer_types into Pxp_lexer_types - * Markup_reader into Pxp_reader - * Markup_types into Pxp_types - * Markup_yacc into Pxp_yacc - * See directory "compatibility" for (almost) compatible wrappers emulating - * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. - * - * Revision 1.2 2000/05/29 21:14:57 gerd - * Changed the type 'encoding' into a polymorphic variant. - * - * Revision 1.1 2000/05/20 20:30:50 gerd - * Initial revision. - * - * - *)