(* $Id$ * ---------------------------------------------------------------------- *) exception Malformed_code type encoding = [ `Enc_utf8 (* UTF-8 *) | `Enc_java | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *) | `Enc_utf16_le (* UTF-16 little endian *) | `Enc_utf16_be (* UTF-16 big endian *) | `Enc_usascii (* US-ASCII (only 7 bit) *) | `Enc_iso88591 (* ISO-8859-1 *) | `Enc_iso88592 (* ISO-8859-2 *) | `Enc_iso88593 (* ISO-8859-3 *) | `Enc_iso88594 (* ISO-8859-4 *) | `Enc_iso88595 (* ISO-8859-5 *) | `Enc_iso88596 (* ISO-8859-6 *) | `Enc_iso88597 (* ISO-8859-7 *) | `Enc_iso88598 (* ISO-8859-8 *) | `Enc_iso88599 (* ISO-8859-9 *) | `Enc_iso885910 (* ISO-8859-10 *) | `Enc_iso885913 (* ISO-8859-13 *) | `Enc_iso885914 (* ISO-8859-14 *) | `Enc_iso885915 (* ISO-8859-15 *) | `Enc_koi8r (* KOI8-R *) | `Enc_jis0201 (* JIS-0201 *) (* Microsoft: *) | `Enc_windows1250 (* WINDOWS-1250 *) | `Enc_windows1251 (* WINDOWS-1251 *) | `Enc_windows1252 (* WINDOWS-1252 *) | `Enc_windows1253 (* WINDOWS-1253 *) | `Enc_windows1254 (* WINDOWS-1254 *) | `Enc_windows1255 (* WINDOWS-1255 *) | `Enc_windows1256 (* WINDOWS-1256 *) | `Enc_windows1257 (* WINDOWS-1257 *) | `Enc_windows1258 (* WINDOWS-1258 *) (* IBM, ASCII-based: *) | `Enc_cp437 | `Enc_cp737 | `Enc_cp775 | `Enc_cp850 | `Enc_cp852 | `Enc_cp855 | `Enc_cp856 | `Enc_cp857 | `Enc_cp860 | `Enc_cp861 | `Enc_cp862 | `Enc_cp863 | `Enc_cp864 | `Enc_cp865 | `Enc_cp866 | `Enc_cp869 | `Enc_cp874 | `Enc_cp1006 (* IBM, EBCDIC-based: *) | `Enc_cp037 | `Enc_cp424 | `Enc_cp500 | `Enc_cp875 | `Enc_cp1026 (* Adobe: *) | `Enc_adobe_standard_encoding | `Enc_adobe_symbol_encoding | `Enc_adobe_zapf_dingbats_encoding (* Apple: *) | `Enc_macroman ] ;; let norm_enc_name e = (* Removes some characters from e; uppercase *) let e' = String.create (String.length e) in let rec next i j = if i < String.length e then match e.[i] with ('-'|'_'|'.') -> next (i+1) j | c -> e'.[j] <- c; next (i+1) (j+1) else j in let l = next 0 0 in String.uppercase(String.sub e' 0 l) ;; let encoding_of_string e = match norm_enc_name e with ("UTF16"|"UCS2"|"ISO10646UCS2") -> `Enc_utf16 | "UTF16BE" -> `Enc_utf16_be | "UTF16LE" -> `Enc_utf16_le | "UTF8" -> `Enc_utf8 | ("UTF8JAVA"|"JAVA") -> `Enc_java | ("USASCII"|"ASCII"|"ISO646US"|"IBM367"|"CP367") -> `Enc_usascii | ("ISO88591"|"LATIN1"|"IBM819"|"CP819") -> `Enc_iso88591 | ("ISO88592"|"LATIN2") -> `Enc_iso88592 | ("ISO88593"|"LATIN3") -> `Enc_iso88593 | ("ISO88594"|"LATIN4") -> `Enc_iso88594 | ("ISO88595"|"CYRILLIC") -> `Enc_iso88595 | ("ISO88596"|"ARABIC"|"ECMA114"|"ASMO708") -> `Enc_iso88596 | ("ISO88597"|"GREEK"|"GREEK8"|"ELOT928"|"ECMA118") -> `Enc_iso88597 | ("ISO88598"|"HEBREW") -> `Enc_iso88598 | ("ISO88599"|"LATIN5") -> `Enc_iso88599 | ("ISO885910"|"LATIN6") -> `Enc_iso885910 | "ISO885913" -> `Enc_iso885913 | "ISO885914" -> `Enc_iso885914 | "ISO885915" -> `Enc_iso885915 | "KOI8R" -> `Enc_koi8r | ("JIS0201"|"JISX0201"|"X0201") -> `Enc_jis0201 | "WINDOWS1250" -> `Enc_windows1250 | "WINDOWS1251" -> `Enc_windows1251 | "WINDOWS1252" -> `Enc_windows1252 | "WINDOWS1253" -> `Enc_windows1253 | "WINDOWS1254" -> `Enc_windows1254 | "WINDOWS1255" -> `Enc_windows1255 | "WINDOWS1256" -> `Enc_windows1256 | "WINDOWS1257" -> `Enc_windows1257 | "WINDOWS1258" -> `Enc_windows1258 | ("CP437"|"IBM437") -> `Enc_cp437 | ("CP737"|"IBM737") -> `Enc_cp737 | ("CP775"|"IBM775") -> `Enc_cp775 | ("CP850"|"IBM850") -> `Enc_cp850 | ("CP852"|"IBM852") -> `Enc_cp852 | ("CP855"|"IBM855") -> `Enc_cp855 | ("CP856"|"IBM856") -> `Enc_cp856 | ("CP857"|"IBM857") -> `Enc_cp857 | ("CP860"|"IBM860") -> `Enc_cp860 | ("CP861"|"IBM861") -> `Enc_cp861 | ("CP862"|"IBM862") -> `Enc_cp862 | ("CP863"|"IBM863") -> `Enc_cp863 | ("CP864"|"IBM864") -> `Enc_cp864 | ("CP865"|"IBM865") -> `Enc_cp865 | ("CP866"|"IBM866") -> `Enc_cp866 | ("CP869"|"IBM869") -> `Enc_cp869 | ("CP874"|"IBM874") -> `Enc_cp874 | ("CP1006"|"IBM1006") -> `Enc_cp1006 | ("CP037"|"IBM037"|"EBCDICCPUS"|"EBCDICCPCA"|"EBCDICCPWT"| "EBCDICCPNL") -> `Enc_cp037 | ("CP424"|"IBM424"|"EBCDICCPHE") -> `Enc_cp424 | ("CP500"|"IBM500"|"EBCDICCPBE"|"EBCDICCPCH") -> `Enc_cp500 | ("CP875"|"IBM875") -> `Enc_cp875 | ("CP1026"|"IBM1026") -> `Enc_cp1026 | "ADOBESTANDARDENCODING" -> `Enc_adobe_standard_encoding | "ADOBESYMBOLENCODING" -> `Enc_adobe_symbol_encoding | "ADOBEZAPFDINGBATSENCODING" -> `Enc_adobe_zapf_dingbats_encoding | "MACINTOSH" -> `Enc_macroman | _ -> failwith "Netconversion.encoding_of_string: unknown encoding" ;; let string_of_encoding (e : encoding) = (* If there is a "preferred MIME name", this name is returned (see IANA). *) match e with `Enc_utf16 -> "UTF-16" | `Enc_utf16_be -> "UTF-16-BE" | `Enc_utf16_le -> "UTF-16-LE" | `Enc_utf8 -> "UTF-8" | `Enc_java -> "UTF-8-JAVA" | `Enc_usascii -> "US-ASCII" | `Enc_iso88591 -> "ISO-8859-1" | `Enc_iso88592 -> "ISO-8859-2" | `Enc_iso88593 -> "ISO-8859-3" | `Enc_iso88594 -> "ISO-8859-4" | `Enc_iso88595 -> "ISO-8859-5" | `Enc_iso88596 -> "ISO-8859-6" | `Enc_iso88597 -> "ISO-8859-7" | `Enc_iso88598 -> "ISO-8859-8" | `Enc_iso88599 -> "ISO-8859-9" | `Enc_iso885910 -> "ISO-8859-10" | `Enc_iso885913 -> "ISO-8859-13" | `Enc_iso885914 -> "ISO-8859-14" | `Enc_iso885915 -> "ISO-8859-15" | `Enc_koi8r -> "KOI8-R" | `Enc_jis0201 -> "JIS_X0201" | `Enc_windows1250 -> "WINDOWS-1250" | `Enc_windows1251 -> "WINDOWS-1251" | `Enc_windows1252 -> "WINDOWS-1252" | `Enc_windows1253 -> "WINDOWS-1253" | `Enc_windows1254 -> "WINDOWS-1254" | `Enc_windows1255 -> "WINDOWS-1255" | `Enc_windows1256 -> "WINDOWS-1256" | `Enc_windows1257 -> "WINDOWS-1257" | `Enc_windows1258 -> "WINDOWS-1258" | `Enc_cp437 -> "CP437" | `Enc_cp737 -> "CP737" | `Enc_cp775 -> "CP775" | `Enc_cp850 -> "CP850" | `Enc_cp852 -> "CP852" | `Enc_cp855 -> "CP855" | `Enc_cp856 -> "CP856" | `Enc_cp857 -> "CP857" | `Enc_cp860 -> "CP860" | `Enc_cp861 -> "CP861" | `Enc_cp862 -> "CP862" | `Enc_cp863 -> "CP863" | `Enc_cp864 -> "CP864" | `Enc_cp865 -> "CP865" | `Enc_cp866 -> "CP866" | `Enc_cp869 -> "CP869" | `Enc_cp874 -> "CP874" | `Enc_cp1006 -> "CP1006" | `Enc_cp037 -> "CP037" | `Enc_cp424 -> "CP424" | `Enc_cp500 -> "CP500" | `Enc_cp875 -> "CP875" | `Enc_cp1026 -> "CP1026" | `Enc_adobe_standard_encoding -> "ADOBE-STANDARD-ENCODING" | `Enc_adobe_symbol_encoding -> "ADOBE-SYMBOL-ENCODING" | `Enc_adobe_zapf_dingbats_encoding -> "ADOBE-ZAPF-DINGBATS-ENCODING" | `Enc_macroman -> "MACINTOSH" ;; let read_iso88591 write s_in p_in l_in = let rec scan k_in k_out c_out = if k_in < l_in then begin let p = Char.code s_in.[p_in + k_in] in let n = write p k_out c_out in if n < 0 then k_in, k_out, `Enc_iso88591 else scan (k_in + 1) (k_out + n) (c_out + 1) end else k_in, k_out, `Enc_iso88591 in scan 0 0 0 ;; let read_usascii write s_in p_in l_in = let rec scan k_in k_out c_out = if k_in < l_in then begin let p = Char.code s_in.[p_in + k_in] in if p >= 0x80 then raise Malformed_code; let n = write p k_out c_out in if n < 0 then k_in, k_out, `Enc_usascii else scan (k_in + 1) (k_out + n) (c_out + 1) end else k_in, k_out, `Enc_usascii in scan 0 0 0 ;; let read_8bit m_to_unicode enc write s_in p_in l_in = let rec scan k_in k_out c_out = if k_in < l_in then begin let p_local = Char.code s_in.[p_in + k_in] in let p_uni = Array.unsafe_get m_to_unicode p_local in if p_uni < 0 then raise Malformed_code; let n = write p_uni k_out c_out in if n < 0 then k_in, k_out, enc else scan (k_in + 1) (k_out + n) (c_out + 1) end else k_in, k_out, enc in scan 0 0 0 ;; let read_utf8 is_java write s_in p_in l_in = let rec scan k_in k_out c_out = if k_in < l_in then begin let n_out, n_in = match s_in.[p_in + k_in] with '\000' -> if is_java then raise Malformed_code; write 0 k_out c_out, 1 | ('\001'..'\127' as c) -> write (Char.code c) k_out c_out, 1 | ('\128'..'\223' as c) -> if k_in + 1 >= l_in then -1, 0 else begin let n1 = Char.code c in let n2 = Char.code (s_in.[p_in + k_in + 1]) in if is_java && (n1 = 0x80 && n2 = 0xc0) then write 0 k_out c_out, 2 else begin if n2 < 128 or n2 > 191 then raise Malformed_code; let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in if p < 128 then raise Malformed_code; write p k_out c_out, 2 end end | ('\224'..'\239' as c) -> if k_in + 2 >= l_in then -1, 0 else begin let n1 = Char.code c in let n2 = Char.code (s_in.[p_in + k_in + 1]) in let n3 = Char.code (s_in.[p_in + k_in + 2]) in if n2 < 128 or n2 > 191 then raise Malformed_code; if n3 < 128 or n3 > 191 then raise Malformed_code; let p = ((n1 land 0b1111) lsl 12) lor ((n2 land 0b111111) lsl 6) lor (n3 land 0b111111) in if p < 0x800 then raise Malformed_code; if (p >= 0xd800 && p < 0xe000) then (* Surrogate pairs are not supported in UTF-8 *) raise Malformed_code; if (p >= 0xfffe && p <= 0xffff) then raise Malformed_code; write p k_out c_out, 3 end | ('\240'..'\247' as c) -> if k_in + 3 >= l_in then -1, 0 else begin let n1 = Char.code c in let n2 = Char.code (s_in.[p_in + k_in + 1]) in let n3 = Char.code (s_in.[p_in + k_in + 2]) in let n4 = Char.code (s_in.[p_in + k_in + 3]) in if n2 < 128 or n2 > 191 then raise Malformed_code; if n3 < 128 or n3 > 191 then raise Malformed_code; if n4 < 128 or n4 > 191 then raise Malformed_code; let p = ((n1 land 0b111) lsl 18) lor ((n2 land 0b111111) lsl 12) lor ((n3 land 0b111111) lsl 6) lor (n4 land 0b111111) in if p < 0x10000 then raise Malformed_code; if p >= 0x110000 then (* These code points are not supported. *) raise Malformed_code; write p k_out c_out, 4 end | _ -> (* Outside the valid range of XML characters *) raise Malformed_code; in (* n_out: number of written bytes; -1 means out buf is full * n_in: number of read bytes; 0 means end of in buf reached * n_in = 0 implies n_out = -1 *) if n_out < 0 then k_in, k_out, `Enc_utf8 else scan (k_in + n_in) (k_out + n_out) (c_out + 1) end else k_in, k_out, `Enc_utf8 in scan 0 0 0 ;; let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00;; let read_utf16_le k_in_0 write s_in p_in l_in = let rec scan k_in k_out c_out = if k_in + 1 < l_in then begin let p = (Char.code s_in.[p_in + k_in]) lor ((Char.code s_in.[p_in + k_in + 1]) lsl 8) in if p >= 0xd800 & p < 0xe000 then begin (* This is a surrogate pair. *) if k_in + 3 < l_in then begin if p <= 0xdbff then begin let q = (Char.code s_in.[p_in + k_in + 2 ]) lor ((Char.code s_in.[p_in + k_in + 3]) lsl 8) in if q < 0xdc00 or q > 0xdfff then raise Malformed_code; let eff_p = (p lsl 10) + q + surrogate_offset in let n = write eff_p k_out c_out in if n < 0 then k_in, k_out, `Enc_utf16_le else scan (k_in + 4) (k_out + n) (c_out + 1) end else (* Malformed pair: *) raise Malformed_code; end else (* Incomplete pair: *) k_in, k_out, `Enc_utf16_le end else if p = 0xfffe then (* Big endian byte order mark: It is illegal here *) raise Malformed_code else begin (* A regular code point *) let n = write p k_out c_out in if n < 0 then k_in, k_out, `Enc_utf16_le else scan (k_in + 2) (k_out + n) (c_out + 1) end end else (* Incomplete character: *) k_in, k_out, `Enc_utf16_le in scan k_in_0 0 0 ;; let read_utf16_be k_in_0 write s_in p_in l_in = let rec scan k_in k_out c_out = if k_in + 1 < l_in then begin let p = (Char.code s_in.[p_in + k_in + 1]) lor ((Char.code s_in.[p_in + k_in]) lsl 8) in if p >= 0xd800 & p < 0xe000 then begin (* This is a surrogate pair. *) if k_in + 3 < l_in then begin if p <= 0xdbff then begin let q = (Char.code s_in.[p_in + k_in + 3 ]) lor ((Char.code s_in.[p_in + k_in + 2]) lsl 8) in if q < 0xdc00 or q > 0xdfff then raise Malformed_code; let eff_p = (p lsl 10) + q + surrogate_offset in let n = write eff_p k_out c_out in if n < 0 then k_in, k_out, `Enc_utf16_be else scan (k_in + 4) (k_out + n) (c_out + 1) end else (* Malformed pair: *) raise Malformed_code; end else (* Incomplete pair: *) k_in, k_out, `Enc_utf16_be end else if p = 0xfffe then (* Little endian byte order mark: It is illegal here *) raise Malformed_code else begin (* A regular code point *) let n = write p k_out c_out in if n < 0 then k_in, k_out, `Enc_utf16_be else scan (k_in + 2) (k_out + n) (c_out + 1) end end else (* Incomplete character: *) k_in, k_out, `Enc_utf16_be in scan k_in_0 0 0 ;; let read_utf16 write s_in p_in l_in = (* Expect a BOM at the beginning of the text *) if l_in >= 2 then begin let c0 = s_in.[p_in + 0] in let c1 = s_in.[p_in + 1] in if c0 = '\254' & c1 = '\255' then begin (* 0xfeff as big endian *) read_utf16_be 2 write s_in p_in l_in end else if c0 = '\255' & c1 = '\254' then begin (* 0xfeff as little endian *) read_utf16_le 2 write s_in p_in l_in end else (* byte order mark missing *) raise Malformed_code end else 0, 0, `Enc_utf16 ;; let write_iso88591 s_out p_out l_out max_chars w p k_out c_out = if k_out < l_out && c_out < max_chars then begin if p > 255 then begin let subst = w p in let l_subst = String.length subst in if k_out + l_subst <= l_out then begin (* Enough space to store 'subst': *) String.blit subst 0 s_out (k_out+p_out) l_subst; l_subst end else (* Not enough space: Stop this round of recoding *) -1 end else begin s_out.[p_out + k_out] <- Char.chr p; 1 end end else -1 (* End-of-buffer indicator *) ;; let write_usascii s_out p_out l_out max_chars w p k_out c_out = if k_out < l_out && c_out < max_chars then begin if p > 127 then begin let subst = w p in let l_subst = String.length subst in if k_out + l_subst <= l_out then begin (* Enough space to store 'subst': *) String.blit subst 0 s_out (k_out+p_out) l_subst; l_subst end else (* Not enough space: Stop this round of recoding *) -1 end else begin s_out.[p_out + k_out] <- Char.chr p; 1 end end else -1 (* End-of-buffer indicator *) ;; let write_8bit from_unicode s_out p_out l_out max_chars w p k_out c_out = if k_out < l_out && c_out < max_chars then begin let p' = match Array.unsafe_get from_unicode (p land 255) with Netmappings.U_nil -> -1 | Netmappings.U_single (p0,q0) -> if p0 = p then q0 else -1 | Netmappings.U_list l -> (try List.assoc p l with Not_found -> -1) in if p' < 0 then begin let subst = w p in let l_subst = String.length subst in if k_out + l_subst <= l_out then begin (* Enough space to store 'subst': *) String.blit subst 0 s_out (k_out+p_out) l_subst; l_subst end else (* Not enough space: Stop this round of recoding *) -1 end else begin s_out.[p_out + k_out] <- Char.chr p'; 1 end end else -1 (* End-of-buffer indicator *) ;; let write_utf8 is_java s_out p_out l_out max_chars w p k_out c_out = if p <= 127 && (not is_java || p <> 0) then begin if k_out < l_out && c_out < max_chars then begin s_out.[p_out + k_out] <- Char.chr p; 1 end else -1 end else if p <= 0x7ff then begin if k_out + 1 < l_out && c_out < max_chars then begin s_out.[p_out + k_out] <- Char.chr (0xc0 lor (p lsr 6)); s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor (p land 0x3f)); 2 end else -1 end else if p <= 0xffff then begin (* Refuse writing surrogate pairs, and fffe, ffff *) if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then failwith "Netconversion.write_utf8"; if k_out + 2 < l_out && c_out < max_chars then begin s_out.[p_out + k_out] <- Char.chr (0xe0 lor (p lsr 12)); s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor (p land 0x3f)); 3 end else -1 end else if p <= 0x10ffff then begin if k_out + 3 < l_out && c_out < max_chars then begin s_out.[p_out + k_out] <- Char.chr (0xf0 lor (p lsr 18)); s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 12) land 0x3f)); s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f)); s_out.[p_out + k_out + 3] <- Char.chr (0x80 lor (p land 0x3f)); 4 end else -1 end else (* Higher code points are not possible in XML: *) failwith "Netconversion.write_utf8" ;; let write_utf16_le s_out p_out l_out max_chars w p k_out c_out = if p >= 0xfffe then begin if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_le"; (* Must be written as surrogate pair *) if k_out + 3 < l_out && c_out < max_chars then begin let high = (p lsr 10) + 0xd800 in let low = (p land 0x3ff) + 0xdc00 in s_out.[p_out + k_out ] <- Char.chr (high land 0xff); s_out.[p_out + k_out + 1] <- Char.chr (high lsr 8); s_out.[p_out + k_out + 2] <- Char.chr (low land 0xff); s_out.[p_out + k_out + 3] <- Char.chr (low lsr 8); 4 end else -1 end else begin if k_out + 1 < l_out && c_out < max_chars then begin s_out.[p_out + k_out ] <- Char.chr (p land 0xff); s_out.[p_out + k_out + 1] <- Char.chr (p lsr 8); 2 end else -1 end ;; let write_utf16_be s_out p_out l_out max_chars w p k_out c_out = if p >= 0xfffe then begin if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_be"; (* Must be written as surrogate pair *) if k_out + 3 < l_out && c_out < max_chars then begin let high = (p lsr 10) + 0xd800 in let low = (p land 0x3ff) + 0xdc00 in s_out.[p_out + k_out + 1] <- Char.chr (high land 0xff); s_out.[p_out + k_out ] <- Char.chr (high lsr 8); s_out.[p_out + k_out + 3] <- Char.chr (low land 0xff); s_out.[p_out + k_out + 2] <- Char.chr (low lsr 8); 4 end else -1 end else begin if k_out + 1 < l_out && c_out < max_chars then begin s_out.[p_out + k_out + 1] <- Char.chr (p land 0xff); s_out.[p_out + k_out ] <- Char.chr (p lsr 8); 2 end else -1 end ;; let recode ~in_enc ~in_buf ~in_pos ~in_len ~out_enc ~out_buf ~out_pos ~out_len ~max_chars ~subst = if (in_pos < 0 || in_len < 0 || in_pos + in_len > String.length in_buf || out_pos < 0 || out_len < 0 || out_pos + out_len > String.length out_buf) then invalid_arg "Netconversion.recode"; let reader = match in_enc with `Enc_iso88591 -> read_iso88591 | `Enc_usascii -> read_usascii | `Enc_utf8 -> read_utf8 false | `Enc_java -> read_utf8 true | `Enc_utf16 -> read_utf16 | `Enc_utf16_le -> read_utf16_le 0 | `Enc_utf16_be -> read_utf16_be 0 | _ -> (try let to_unicode' = Hashtbl.find Netmappings.to_unicode in_enc in let to_unicode = Netmappings.lock(); Lazy.force to_unicode' in Netmappings.unlock(); read_8bit to_unicode in_enc with Not_found -> failwith("Support for the encoding `" ^ string_of_encoding in_enc ^ "' has not been compiled into Netstring") ) in let writer = match out_enc with `Enc_iso88591 -> write_iso88591 out_buf out_pos out_len max_chars subst | `Enc_usascii -> write_usascii out_buf out_pos out_len max_chars subst | `Enc_utf8 -> write_utf8 false out_buf out_pos out_len max_chars subst | `Enc_java -> write_utf8 true out_buf out_pos out_len max_chars subst | `Enc_utf16 -> failwith "Netconversion.recode" | `Enc_utf16_le -> write_utf16_le out_buf out_pos out_len max_chars subst | `Enc_utf16_be -> write_utf16_be out_buf out_pos out_len max_chars subst | _ -> (try let from_unicode' = Hashtbl.find Netmappings.from_unicode out_enc in let from_unicode = Netmappings.lock(); Lazy.force from_unicode' in Netmappings.unlock(); write_8bit from_unicode out_buf out_pos out_len max_chars subst with Not_found -> failwith("Support for the encoding `" ^ string_of_encoding out_enc ^ "' has not been compiled into Netstring") ) in reader writer in_buf in_pos in_len ;; let makechar enc p = match enc with `Enc_iso88591 -> if p > 255 then raise Not_found; String.make 1 (Char.chr p) | `Enc_usascii -> if p > 127 then raise Not_found; String.make 1 (Char.chr p) | `Enc_utf8 -> let s = String.create 4 in let n = write_utf8 false s 0 4 1 (fun _ -> raise Not_found) p 0 0 in String.sub s 0 n | `Enc_java -> let s = String.create 4 in let n = write_utf8 true s 0 4 1 (fun _ -> raise Not_found) p 0 0 in String.sub s 0 n | `Enc_utf16_le -> let s = String.create 4 in let n = write_utf16_le s 0 4 1 (fun _ -> raise Not_found) p 0 0 in String.sub s 0 n | `Enc_utf16_be -> let s = String.create 4 in let n = write_utf16_be s 0 4 1 (fun _ -> raise Not_found) p 0 0 in String.sub s 0 n | `Enc_utf16 -> failwith "Netconversion.makechar" | _ -> let s = String.create 1 in let from_unicode' = try Hashtbl.find Netmappings.from_unicode enc with Not_found -> failwith("Support for the encoding `" ^ string_of_encoding enc ^ "' has not been compiled into Netstring") in let from_unicode = Netmappings.lock(); Lazy.force from_unicode' in Netmappings.unlock(); let n = write_8bit from_unicode s 0 1 1 (fun _ -> raise Not_found) p 0 0 in s ;; let recode_string ~in_enc ~out_enc ?(subst = (fun _ -> raise Not_found)) s = let length = String.length s in let size = 1024 in let out_buf = String.create size in let rec recode_loop k s_done in_enc = (* 'k' bytes of 's' have already been processed, and the result is in * 's_done'. *) (* Recode to 'out_buf': *) let in_len = length - k in let in_done, out_done, in_enc' = recode ~in_enc:in_enc ~in_buf:s ~in_pos:k ~in_len:in_len ~out_enc:out_enc ~out_buf:out_buf ~out_pos:0 ~out_len:size ~max_chars:size ~subst:subst in (* Collect the results: *) let k' = k + in_done in let s_done' = String.sub out_buf 0 out_done :: s_done in (* Still something to do? *) if k' < length then recode_loop k' s_done' in_enc' else (* No: Concatenate s_done' to get the final result. *) String.concat "" (List.rev s_done') in recode_loop 0 [] in_enc ;; (* ====================================================================== * History: * * $Log$ * Revision 1.1 2000/11/17 09:57:28 lpadovan * Initial revision * * Revision 1.2 2000/08/29 00:46:41 gerd * New type for the Unicode to 8 bit translation table. * The Netmappings tables are now Lazy.t. * * Revision 1.1 2000/08/13 00:02:57 gerd * Initial revision. * * * ====================================================================== * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.ml): * * Revision 1.5 2000/07/27 00:41:14 gerd * new 8 bit codes * * Revision 1.4 2000/07/04 22:11:41 gerd * Implemented the enhancements and extensions of * rev. 1.4 of pxp_encoding.mli. * * Revision 1.3 2000/05/29 23:48:38 gerd * Changed module names: * Markup_aux into Pxp_aux * Markup_codewriter into Pxp_codewriter * Markup_document into Pxp_document * Markup_dtd into Pxp_dtd * Markup_entity into Pxp_entity * Markup_lexer_types into Pxp_lexer_types * Markup_reader into Pxp_reader * Markup_types into Pxp_types * Markup_yacc into Pxp_yacc * See directory "compatibility" for (almost) compatible wrappers emulating * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. * * Revision 1.2 2000/05/29 21:14:57 gerd * Changed the type 'encoding' into a polymorphic variant. * * Revision 1.1 2000/05/20 20:30:50 gerd * Initial revision. * * *)