(* Cross-checks Netconversion.recode_string against the external 'iconv'
 * command: every test recodes a sample string with both implementations
 * and asserts that the results agree and that the round trip is lossless.
 *
 * Strings are assembled with Buffer rather than String.create and in-place
 * string mutation, so the file builds with compilers that have immutable
 * strings as well as with older ones.
 *)

(* [make_iso enc] concatenates [Netconversion.makechar enc i] for
 * i = 0..255. Code points for which makechar raises Not_found are
 * skipped.
 *)
let make_iso enc =
  let b = Buffer.create 256 in
  for i = 0 to 255 do
    let u =
      try Netconversion.makechar (enc :> Netconversion.encoding) i
      with Not_found -> ""
    in
    Buffer.add_string b u
  done;
  Buffer.contents b
;;


(* [make_ucs2 start stop] returns the code points start .. stop-1, each
 * written as two bytes, high byte first. Char.chr raises Invalid_argument
 * for code points that do not fit into 16 bits.
 *)
let make_ucs2 start stop =
  let b = Buffer.create (max 1 ((stop - start) * 2)) in
  for c = start to stop - 1 do
    Buffer.add_char b (Char.chr (c lsr 8));
    Buffer.add_char b (Char.chr (c land 0xff))
  done;
  Buffer.contents b
;;


(* [make_ucs4 start stop] returns the code points start .. stop-1, each
 * written as four bytes, most significant byte first.
 *)
let make_ucs4 start stop =
  let b = Buffer.create (max 1 ((stop - start) * 4)) in
  for c = start to stop - 1 do
    Buffer.add_char b (Char.chr (c lsr 24));
    Buffer.add_char b (Char.chr ((c lsr 16) land 0xff));
    Buffer.add_char b (Char.chr ((c lsr 8) land 0xff));
    Buffer.add_char b (Char.chr (c land 0xff))
  done;
  Buffer.contents b
;;


(* Maps an encoding tag to the name passed to iconv's -f / -t options. *)
let name_of_encoding = function
  | `Enc_iso88591  -> "ISO_8859-1"
  | `Enc_iso88592  -> "ISO_8859-2"
  | `Enc_iso88593  -> "ISO_8859-3"
  | `Enc_iso88594  -> "ISO_8859-4"
  | `Enc_iso88595  -> "ISO_8859-5"
  | `Enc_iso88596  -> "ISO_8859-6"
  | `Enc_iso88597  -> "ISO_8859-7"
  | `Enc_iso88598  -> "ISO_8859-8"
  | `Enc_iso88599  -> "ISO_8859-9"
  | `Enc_iso885910 -> "ISO_8859-10"
  | `Enc_iso885913 -> "ISO_8859-13"
  | `Enc_iso885914 -> "ISO_8859-14"
  | `Enc_iso885915 -> "ISO_8859-15"
  | `Enc_utf8      -> "UTF-8"
  | `Enc_ucs4      -> "UCS-4"
  | `Enc_ucs2      -> "UCS-2"
  | `Enc_utf16     -> "UTF-16"
      (* Note: GNU-iconv assumes big endian byte order *)
;;


(* [iconv_recode_string in_enc out_enc in_s] pipes [in_s] through
 * "iconv -f <in_enc> -t <out_enc>" and returns everything the command
 * wrote to its standard output.
 *
 * Raises Failure if the command does not terminate with exit status 0,
 * so that a missing or failing iconv is reported as such instead of
 * showing up as a mismatch in one of the assertions of the caller.
 *)
let iconv_recode_string in_enc out_enc in_s =
  let in_enc_name  = name_of_encoding in_enc in
  let out_enc_name = name_of_encoding out_enc in
  let cmd = "iconv -f " ^ in_enc_name ^ " -t " ^ out_enc_name in
  let out_ch, in_ch = Unix.open_process cmd in
  (* Write in_s to in_ch in a new thread: *)
  ignore
    (Thread.create
       (fun () ->
          output_string in_ch in_s;
          close_out in_ch)
       ());
  (* Read the result in the current thread. The channel is buffered, so
   * reading character by character does not cost one system call each.
   *)
  let out_b = Buffer.create 1024 in
  (try
     while true do
       Buffer.add_char out_b (input_char out_ch)
     done
   with End_of_file -> ());
  let fail what n =
    failwith (cmd ^ ": " ^ what ^ " " ^ string_of_int n) in
  (match Unix.close_process (out_ch, in_ch) with
   | Unix.WEXITED 0   -> ()
   | Unix.WEXITED n   -> fail "exited with status" n
   | Unix.WSIGNALED n -> fail "killed by signal" n
   | Unix.WSTOPPED n  -> fail "stopped by signal" n);
  Buffer.contents out_b
;;


(* [check_roundtrip net_enc iconv_enc s] recodes [s] to UTF-8 and back
 * again, once with Netconversion (reading [s] as [net_enc]) and once with
 * iconv (reading [s] as [iconv_enc]). Asserts that both implementations
 * produce the same UTF-8 string, the same string on the way back, and
 * that the latter is equal to [s].
 *)
let check_roundtrip net_enc iconv_enc s =
  let s1' = Netconversion.recode_string net_enc `Enc_utf8 s in
  let s2' = iconv_recode_string iconv_enc `Enc_utf8 s in
  assert (s1' = s2');
  let s1 = Netconversion.recode_string `Enc_utf8 net_enc s1' in
  let s2 = iconv_recode_string `Enc_utf8 iconv_enc s1' in
  assert (s1 = s2 && s1 = s)
;;


(* Round-trip test for one 8 bit encoding; the sample is [make_iso enc]. *)
let test_iso_and_utf8 enc =
  let name = name_of_encoding enc in
  print_string ("Recode: " ^ name ^ " and UTF-8... ");
  flush stdout;
  let s = make_iso enc in
  check_roundtrip (enc :> Netconversion.encoding) enc s;
  print_endline "OK";
  flush stdout
;;


(* Round-trip test for the code points below the surrogate area. *)
let test_utf16_and_utf8_0000_d7ff () =
  print_string "Recode: UTF-16-BE and UTF-8, #0000-#D7FF... ";
  flush stdout;
  let s = make_ucs2 0 0xd800 in
  check_roundtrip `Enc_utf16_be `Enc_utf16 s;
  print_endline "OK";
  flush stdout
;;


(* Round-trip test for the code points #E000 to #FFFD. *)
let test_utf16_and_utf8_e000_fffd () =
  print_string "Recode: UTF-16-BE and UTF-8, #E000-#FFFD... ";
  flush stdout;
  let s = make_ucs2 0xe000 0xfffe in
  check_roundtrip `Enc_utf16_be `Enc_utf16 s;
  print_endline "OK";
  flush stdout
;;


(* Round-trip test for the code points #10000 to #10FFFF, in 16 slices of
 * 0x10000 code points each. The UTF-16 sample of a slice is obtained by
 * letting iconv recode the UCS-4 representation. Prints one "+" per
 * slice.
 *)
let test_utf16_and_utf8_10000_10FFFF () =
  print_string "Recode: UTF-16-BE and UTF-8, #10000-#10FFFF... ";
  flush stdout;
  for i = 1 to 16 do
    let s0 = make_ucs4 (i * 0x10000) (i * 0x10000 + 0x10000) in
    let s = iconv_recode_string `Enc_ucs4 `Enc_utf16 s0 in
    check_roundtrip `Enc_utf16_be `Enc_utf16 s;
    print_string "+";
    flush stdout
  done;
  print_endline "OK";
  flush stdout
;;


print_endline "Warning: You need the command 'iconv' to run this test!";
flush stdout;
test_iso_and_utf8 `Enc_iso88591;
test_iso_and_utf8 `Enc_iso88592;
test_iso_and_utf8 `Enc_iso88593;
test_iso_and_utf8 `Enc_iso88594;
test_iso_and_utf8 `Enc_iso88595;
test_iso_and_utf8 `Enc_iso88596;
test_iso_and_utf8 `Enc_iso88597;
(* test_iso_and_utf8 `Enc_iso88598; *)
test_iso_and_utf8 `Enc_iso88599;
test_iso_and_utf8 `Enc_iso885910;
(* test_iso_and_utf8 `Enc_iso885913; *)
(* test_iso_and_utf8 `Enc_iso885914; *)
(* test_iso_and_utf8 `Enc_iso885915; *)
test_utf16_and_utf8_0000_d7ff();
test_utf16_and_utf8_e000_fffd();
(* This test does not work because iconv does not support the surrogate
 * representation of UTF-16:
 * test_utf16_and_utf8_10000_10FFFF();
 *)
()
;;