2 * ----------------------------------------------------------------------
5 exception Malformed_code
9 [ `Enc_utf8 (* UTF-8 *)
11 | `Enc_utf16 (* UTF-16 with unspecified endianess (restricted usage) *)
12 | `Enc_utf16_le (* UTF-16 little endian *)
13 | `Enc_utf16_be (* UTF-16 big endian *)
14 | `Enc_usascii (* US-ASCII (only 7 bit) *)
15 | `Enc_iso88591 (* ISO-8859-1 *)
16 | `Enc_iso88592 (* ISO-8859-2 *)
17 | `Enc_iso88593 (* ISO-8859-3 *)
18 | `Enc_iso88594 (* ISO-8859-4 *)
19 | `Enc_iso88595 (* ISO-8859-5 *)
20 | `Enc_iso88596 (* ISO-8859-6 *)
21 | `Enc_iso88597 (* ISO-8859-7 *)
22 | `Enc_iso88598 (* ISO-8859-8 *)
23 | `Enc_iso88599 (* ISO-8859-9 *)
24 | `Enc_iso885910 (* ISO-8859-10 *)
25 | `Enc_iso885913 (* ISO-8859-13 *)
26 | `Enc_iso885914 (* ISO-8859-14 *)
27 | `Enc_iso885915 (* ISO-8859-15 *)
28 | `Enc_koi8r (* KOI8-R *)
29 | `Enc_jis0201 (* JIS-0201 *)
31 | `Enc_windows1250 (* WINDOWS-1250 *)
32 | `Enc_windows1251 (* WINDOWS-1251 *)
33 | `Enc_windows1252 (* WINDOWS-1252 *)
34 | `Enc_windows1253 (* WINDOWS-1253 *)
35 | `Enc_windows1254 (* WINDOWS-1254 *)
36 | `Enc_windows1255 (* WINDOWS-1255 *)
37 | `Enc_windows1256 (* WINDOWS-1256 *)
38 | `Enc_windows1257 (* WINDOWS-1257 *)
39 | `Enc_windows1258 (* WINDOWS-1258 *)
40 (* IBM, ASCII-based: *)
59 (* IBM, EBCDIC-based: *)
66 | `Enc_adobe_standard_encoding
67 | `Enc_adobe_symbol_encoding
68 | `Enc_adobe_zapf_dingbats_encoding
77 (* Removes some characters from e; uppercase *)
78 let e' = String.create (String.length e) in
80 if i < String.length e then
82 ('-'|'_'|'.') -> next (i+1) j
83 | c -> e'.[j] <- c; next (i+1) (j+1)
88 String.uppercase(String.sub e' 0 l)
92 let encoding_of_string e =
93 match norm_enc_name e with
94 ("UTF16"|"UCS2"|"ISO10646UCS2") -> `Enc_utf16
95 | "UTF16BE" -> `Enc_utf16_be
96 | "UTF16LE" -> `Enc_utf16_le
98 | ("UTF8JAVA"|"JAVA") -> `Enc_java
99 | ("USASCII"|"ASCII"|"ISO646US"|"IBM367"|"CP367") -> `Enc_usascii
100 | ("ISO88591"|"LATIN1"|"IBM819"|"CP819") -> `Enc_iso88591
101 | ("ISO88592"|"LATIN2") -> `Enc_iso88592
102 | ("ISO88593"|"LATIN3") -> `Enc_iso88593
103 | ("ISO88594"|"LATIN4") -> `Enc_iso88594
104 | ("ISO88595"|"CYRILLIC") -> `Enc_iso88595
105 | ("ISO88596"|"ARABIC"|"ECMA114"|"ASMO708") -> `Enc_iso88596
106 | ("ISO88597"|"GREEK"|"GREEK8"|"ELOT928"|"ECMA118") -> `Enc_iso88597
107 | ("ISO88598"|"HEBREW") -> `Enc_iso88598
108 | ("ISO88599"|"LATIN5") -> `Enc_iso88599
109 | ("ISO885910"|"LATIN6") -> `Enc_iso885910
110 | "ISO885913" -> `Enc_iso885913
111 | "ISO885914" -> `Enc_iso885914
112 | "ISO885915" -> `Enc_iso885915
113 | "KOI8R" -> `Enc_koi8r
114 | ("JIS0201"|"JISX0201"|"X0201") -> `Enc_jis0201
116 | "WINDOWS1250" -> `Enc_windows1250
117 | "WINDOWS1251" -> `Enc_windows1251
118 | "WINDOWS1252" -> `Enc_windows1252
119 | "WINDOWS1253" -> `Enc_windows1253
120 | "WINDOWS1254" -> `Enc_windows1254
121 | "WINDOWS1255" -> `Enc_windows1255
122 | "WINDOWS1256" -> `Enc_windows1256
123 | "WINDOWS1257" -> `Enc_windows1257
124 | "WINDOWS1258" -> `Enc_windows1258
126 | ("CP437"|"IBM437") -> `Enc_cp437
127 | ("CP737"|"IBM737") -> `Enc_cp737
128 | ("CP775"|"IBM775") -> `Enc_cp775
129 | ("CP850"|"IBM850") -> `Enc_cp850
130 | ("CP852"|"IBM852") -> `Enc_cp852
131 | ("CP855"|"IBM855") -> `Enc_cp855
132 | ("CP856"|"IBM856") -> `Enc_cp856
133 | ("CP857"|"IBM857") -> `Enc_cp857
134 | ("CP860"|"IBM860") -> `Enc_cp860
135 | ("CP861"|"IBM861") -> `Enc_cp861
136 | ("CP862"|"IBM862") -> `Enc_cp862
137 | ("CP863"|"IBM863") -> `Enc_cp863
138 | ("CP864"|"IBM864") -> `Enc_cp864
139 | ("CP865"|"IBM865") -> `Enc_cp865
140 | ("CP866"|"IBM866") -> `Enc_cp866
141 | ("CP869"|"IBM869") -> `Enc_cp869
142 | ("CP874"|"IBM874") -> `Enc_cp874
143 | ("CP1006"|"IBM1006") -> `Enc_cp1006
145 | ("CP037"|"IBM037"|"EBCDICCPUS"|"EBCDICCPCA"|"EBCDICCPWT"|
146 "EBCDICCPNL") -> `Enc_cp037
147 | ("CP424"|"IBM424"|"EBCDICCPHE") -> `Enc_cp424
148 | ("CP500"|"IBM500"|"EBCDICCPBE"|"EBCDICCPCH") -> `Enc_cp500
149 | ("CP875"|"IBM875") -> `Enc_cp875
150 | ("CP1026"|"IBM1026") -> `Enc_cp1026
152 | "ADOBESTANDARDENCODING" -> `Enc_adobe_standard_encoding
153 | "ADOBESYMBOLENCODING" -> `Enc_adobe_symbol_encoding
154 | "ADOBEZAPFDINGBATSENCODING" -> `Enc_adobe_zapf_dingbats_encoding
156 | "MACINTOSH" -> `Enc_macroman
159 failwith "Netconversion.encoding_of_string: unknown encoding"
163 let string_of_encoding (e : encoding) =
164 (* If there is a "preferred MIME name", this name is returned (see IANA). *)
166 `Enc_utf16 -> "UTF-16"
167 | `Enc_utf16_be -> "UTF-16-BE"
168 | `Enc_utf16_le -> "UTF-16-LE"
169 | `Enc_utf8 -> "UTF-8"
170 | `Enc_java -> "UTF-8-JAVA"
171 | `Enc_usascii -> "US-ASCII"
172 | `Enc_iso88591 -> "ISO-8859-1"
173 | `Enc_iso88592 -> "ISO-8859-2"
174 | `Enc_iso88593 -> "ISO-8859-3"
175 | `Enc_iso88594 -> "ISO-8859-4"
176 | `Enc_iso88595 -> "ISO-8859-5"
177 | `Enc_iso88596 -> "ISO-8859-6"
178 | `Enc_iso88597 -> "ISO-8859-7"
179 | `Enc_iso88598 -> "ISO-8859-8"
180 | `Enc_iso88599 -> "ISO-8859-9"
181 | `Enc_iso885910 -> "ISO-8859-10"
182 | `Enc_iso885913 -> "ISO-8859-13"
183 | `Enc_iso885914 -> "ISO-8859-14"
184 | `Enc_iso885915 -> "ISO-8859-15"
185 | `Enc_koi8r -> "KOI8-R"
186 | `Enc_jis0201 -> "JIS_X0201"
187 | `Enc_windows1250 -> "WINDOWS-1250"
188 | `Enc_windows1251 -> "WINDOWS-1251"
189 | `Enc_windows1252 -> "WINDOWS-1252"
190 | `Enc_windows1253 -> "WINDOWS-1253"
191 | `Enc_windows1254 -> "WINDOWS-1254"
192 | `Enc_windows1255 -> "WINDOWS-1255"
193 | `Enc_windows1256 -> "WINDOWS-1256"
194 | `Enc_windows1257 -> "WINDOWS-1257"
195 | `Enc_windows1258 -> "WINDOWS-1258"
196 | `Enc_cp437 -> "CP437"
197 | `Enc_cp737 -> "CP737"
198 | `Enc_cp775 -> "CP775"
199 | `Enc_cp850 -> "CP850"
200 | `Enc_cp852 -> "CP852"
201 | `Enc_cp855 -> "CP855"
202 | `Enc_cp856 -> "CP856"
203 | `Enc_cp857 -> "CP857"
204 | `Enc_cp860 -> "CP860"
205 | `Enc_cp861 -> "CP861"
206 | `Enc_cp862 -> "CP862"
207 | `Enc_cp863 -> "CP863"
208 | `Enc_cp864 -> "CP864"
209 | `Enc_cp865 -> "CP865"
210 | `Enc_cp866 -> "CP866"
211 | `Enc_cp869 -> "CP869"
212 | `Enc_cp874 -> "CP874"
213 | `Enc_cp1006 -> "CP1006"
214 | `Enc_cp037 -> "CP037"
215 | `Enc_cp424 -> "CP424"
216 | `Enc_cp500 -> "CP500"
217 | `Enc_cp875 -> "CP875"
218 | `Enc_cp1026 -> "CP1026"
219 | `Enc_adobe_standard_encoding -> "ADOBE-STANDARD-ENCODING"
220 | `Enc_adobe_symbol_encoding -> "ADOBE-SYMBOL-ENCODING"
221 | `Enc_adobe_zapf_dingbats_encoding -> "ADOBE-ZAPF-DINGBATS-ENCODING"
222 | `Enc_macroman -> "MACINTOSH"
226 let read_iso88591 write s_in p_in l_in =
227 let rec scan k_in k_out c_out =
228 if k_in < l_in then begin
229 let p = Char.code s_in.[p_in + k_in] in
230 let n = write p k_out c_out in
232 k_in, k_out, `Enc_iso88591
234 scan (k_in + 1) (k_out + n) (c_out + 1)
237 k_in, k_out, `Enc_iso88591
243 let read_usascii write s_in p_in l_in =
244 let rec scan k_in k_out c_out =
245 if k_in < l_in then begin
246 let p = Char.code s_in.[p_in + k_in] in
247 if p >= 0x80 then raise Malformed_code;
248 let n = write p k_out c_out in
250 k_in, k_out, `Enc_usascii
252 scan (k_in + 1) (k_out + n) (c_out + 1)
255 k_in, k_out, `Enc_usascii
261 let read_8bit m_to_unicode enc write s_in p_in l_in =
262 let rec scan k_in k_out c_out =
263 if k_in < l_in then begin
264 let p_local = Char.code s_in.[p_in + k_in] in
265 let p_uni = Array.unsafe_get m_to_unicode p_local in
266 if p_uni < 0 then raise Malformed_code;
267 let n = write p_uni k_out c_out in
271 scan (k_in + 1) (k_out + n) (c_out + 1)
280 let read_utf8 is_java write s_in p_in l_in =
281 let rec scan k_in k_out c_out =
282 if k_in < l_in then begin
284 match s_in.[p_in + k_in] with
286 if is_java then raise Malformed_code;
287 write 0 k_out c_out, 1
288 | ('\001'..'\127' as c) ->
289 write (Char.code c) k_out c_out, 1
290 | ('\128'..'\223' as c) ->
291 if k_in + 1 >= l_in then
294 let n1 = Char.code c in
295 let n2 = Char.code (s_in.[p_in + k_in + 1]) in
296 if is_java && (n1 = 0x80 && n2 = 0xc0) then
297 write 0 k_out c_out, 2
299 if n2 < 128 or n2 > 191 then raise Malformed_code;
300 let p = ((n1 land 0b11111) lsl 6) lor (n2 land 0b111111) in
301 if p < 128 then raise Malformed_code;
302 write p k_out c_out, 2
305 | ('\224'..'\239' as c) ->
306 if k_in + 2 >= l_in then
309 let n1 = Char.code c in
310 let n2 = Char.code (s_in.[p_in + k_in + 1]) in
311 let n3 = Char.code (s_in.[p_in + k_in + 2]) in
312 if n2 < 128 or n2 > 191 then raise Malformed_code;
313 if n3 < 128 or n3 > 191 then raise Malformed_code;
315 ((n1 land 0b1111) lsl 12) lor
316 ((n2 land 0b111111) lsl 6) lor
319 if p < 0x800 then raise Malformed_code;
320 if (p >= 0xd800 && p < 0xe000) then
321 (* Surrogate pairs are not supported in UTF-8 *)
322 raise Malformed_code;
323 if (p >= 0xfffe && p <= 0xffff) then
324 raise Malformed_code;
325 write p k_out c_out, 3
327 | ('\240'..'\247' as c) ->
328 if k_in + 3 >= l_in then
331 let n1 = Char.code c in
332 let n2 = Char.code (s_in.[p_in + k_in + 1]) in
333 let n3 = Char.code (s_in.[p_in + k_in + 2]) in
334 let n4 = Char.code (s_in.[p_in + k_in + 3]) in
335 if n2 < 128 or n2 > 191 then raise Malformed_code;
336 if n3 < 128 or n3 > 191 then raise Malformed_code;
337 if n4 < 128 or n4 > 191 then raise Malformed_code;
338 let p = ((n1 land 0b111) lsl 18) lor
339 ((n2 land 0b111111) lsl 12) lor
340 ((n3 land 0b111111) lsl 6) lor
343 if p < 0x10000 then raise Malformed_code;
344 if p >= 0x110000 then
345 (* These code points are not supported. *)
346 raise Malformed_code;
347 write p k_out c_out, 4
350 (* Outside the valid range of XML characters *)
351 raise Malformed_code;
353 (* n_out: number of written bytes; -1 means out buf is full
354 * n_in: number of read bytes; 0 means end of in buf reached
355 * n_in = 0 implies n_out = -1
358 k_in, k_out, `Enc_utf8
360 scan (k_in + n_in) (k_out + n_out) (c_out + 1)
363 k_in, k_out, `Enc_utf8
369 let surrogate_offset = 0x10000 - (0xD800 lsl 10) - 0xDC00;;
371 let read_utf16_le k_in_0 write s_in p_in l_in =
372 let rec scan k_in k_out c_out =
373 if k_in + 1 < l_in then begin
374 let p = (Char.code s_in.[p_in + k_in]) lor ((Char.code s_in.[p_in + k_in + 1]) lsl 8) in
376 if p >= 0xd800 & p < 0xe000 then begin
377 (* This is a surrogate pair. *)
378 if k_in + 3 < l_in then begin
379 if p <= 0xdbff then begin
380 let q = (Char.code s_in.[p_in + k_in + 2 ]) lor
381 ((Char.code s_in.[p_in + k_in + 3]) lsl 8) in
382 if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
383 let eff_p = (p lsl 10) + q + surrogate_offset in
384 let n = write eff_p k_out c_out in
386 k_in, k_out, `Enc_utf16_le
388 scan (k_in + 4) (k_out + n) (c_out + 1)
391 (* Malformed pair: *)
392 raise Malformed_code;
395 (* Incomplete pair: *)
396 k_in, k_out, `Enc_utf16_le
401 (* Big endian byte order mark: It is illegal here *)
404 (* A regular code point *)
405 let n = write p k_out c_out in
407 k_in, k_out, `Enc_utf16_le
409 scan (k_in + 2) (k_out + n) (c_out + 1)
413 (* Incomplete character: *)
414 k_in, k_out, `Enc_utf16_le
420 let read_utf16_be k_in_0 write s_in p_in l_in =
421 let rec scan k_in k_out c_out =
422 if k_in + 1 < l_in then begin
423 let p = (Char.code s_in.[p_in + k_in + 1]) lor ((Char.code s_in.[p_in + k_in]) lsl 8) in
425 if p >= 0xd800 & p < 0xe000 then begin
426 (* This is a surrogate pair. *)
427 if k_in + 3 < l_in then begin
428 if p <= 0xdbff then begin
429 let q = (Char.code s_in.[p_in + k_in + 3 ]) lor
430 ((Char.code s_in.[p_in + k_in + 2]) lsl 8) in
431 if q < 0xdc00 or q > 0xdfff then raise Malformed_code;
432 let eff_p = (p lsl 10) + q + surrogate_offset in
433 let n = write eff_p k_out c_out in
435 k_in, k_out, `Enc_utf16_be
437 scan (k_in + 4) (k_out + n) (c_out + 1)
440 (* Malformed pair: *)
441 raise Malformed_code;
444 (* Incomplete pair: *)
445 k_in, k_out, `Enc_utf16_be
450 (* Little endian byte order mark: It is illegal here *)
453 (* A regular code point *)
454 let n = write p k_out c_out in
456 k_in, k_out, `Enc_utf16_be
458 scan (k_in + 2) (k_out + n) (c_out + 1)
463 (* Incomplete character: *)
464 k_in, k_out, `Enc_utf16_be
470 let read_utf16 write s_in p_in l_in =
471 (* Expect a BOM at the beginning of the text *)
472 if l_in >= 2 then begin
473 let c0 = s_in.[p_in + 0] in
474 let c1 = s_in.[p_in + 1] in
475 if c0 = '\254' & c1 = '\255' then begin
476 (* 0xfeff as big endian *)
477 read_utf16_be 2 write s_in p_in l_in
480 if c0 = '\255' & c1 = '\254' then begin
481 (* 0xfeff as little endian *)
482 read_utf16_le 2 write s_in p_in l_in
485 (* byte order mark missing *)
493 let write_iso88591 s_out p_out l_out max_chars w p k_out c_out =
494 if k_out < l_out && c_out < max_chars then begin
495 if p > 255 then begin
497 let l_subst = String.length subst in
498 if k_out + l_subst <= l_out then begin
499 (* Enough space to store 'subst': *)
500 String.blit subst 0 s_out (k_out+p_out) l_subst;
504 (* Not enough space: Stop this round of recoding *)
508 s_out.[p_out + k_out] <- Char.chr p;
513 -1 (* End-of-buffer indicator *)
517 let write_usascii s_out p_out l_out max_chars w p k_out c_out =
518 if k_out < l_out && c_out < max_chars then begin
519 if p > 127 then begin
521 let l_subst = String.length subst in
522 if k_out + l_subst <= l_out then begin
523 (* Enough space to store 'subst': *)
524 String.blit subst 0 s_out (k_out+p_out) l_subst;
528 (* Not enough space: Stop this round of recoding *)
532 s_out.[p_out + k_out] <- Char.chr p;
537 -1 (* End-of-buffer indicator *)
541 let write_8bit from_unicode s_out p_out l_out max_chars w p k_out c_out =
542 if k_out < l_out && c_out < max_chars then begin
544 match Array.unsafe_get from_unicode (p land 255) with
545 Netmappings.U_nil -> -1
546 | Netmappings.U_single (p0,q0) ->
547 if p0 = p then q0 else -1
548 | Netmappings.U_list l ->
549 (try List.assoc p l with Not_found -> -1)
553 let l_subst = String.length subst in
554 if k_out + l_subst <= l_out then begin
555 (* Enough space to store 'subst': *)
556 String.blit subst 0 s_out (k_out+p_out) l_subst;
560 (* Not enough space: Stop this round of recoding *)
564 s_out.[p_out + k_out] <- Char.chr p';
569 -1 (* End-of-buffer indicator *)
573 let write_utf8 is_java s_out p_out l_out max_chars w p k_out c_out =
574 if p <= 127 && (not is_java || p <> 0) then begin
575 if k_out < l_out && c_out < max_chars then begin
576 s_out.[p_out + k_out] <- Char.chr p;
581 else if p <= 0x7ff then begin
582 if k_out + 1 < l_out && c_out < max_chars then begin
583 s_out.[p_out + k_out] <- Char.chr (0xc0 lor (p lsr 6));
584 s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor (p land 0x3f));
589 else if p <= 0xffff then begin
590 (* Refuse writing surrogate pairs, and fffe, ffff *)
591 if (p >= 0xd800 & p < 0xe000) or (p >= 0xfffe) then
592 failwith "Netconversion.write_utf8";
593 if k_out + 2 < l_out && c_out < max_chars then begin
594 s_out.[p_out + k_out] <- Char.chr (0xe0 lor (p lsr 12));
595 s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f));
596 s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor (p land 0x3f));
601 else if p <= 0x10ffff then begin
602 if k_out + 3 < l_out && c_out < max_chars then begin
603 s_out.[p_out + k_out] <- Char.chr (0xf0 lor (p lsr 18));
604 s_out.[p_out + k_out + 1] <- Char.chr (0x80 lor ((p lsr 12) land 0x3f));
605 s_out.[p_out + k_out + 2] <- Char.chr (0x80 lor ((p lsr 6) land 0x3f));
606 s_out.[p_out + k_out + 3] <- Char.chr (0x80 lor (p land 0x3f));
612 (* Higher code points are not possible in XML: *)
613 failwith "Netconversion.write_utf8"
617 let write_utf16_le s_out p_out l_out max_chars w p k_out c_out =
618 if p >= 0xfffe then begin
619 if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_le";
620 (* Must be written as surrogate pair *)
621 if k_out + 3 < l_out && c_out < max_chars then begin
622 let high = (p lsr 10) + 0xd800 in
623 let low = (p land 0x3ff) + 0xdc00 in
624 s_out.[p_out + k_out ] <- Char.chr (high land 0xff);
625 s_out.[p_out + k_out + 1] <- Char.chr (high lsr 8);
626 s_out.[p_out + k_out + 2] <- Char.chr (low land 0xff);
627 s_out.[p_out + k_out + 3] <- Char.chr (low lsr 8);
633 if k_out + 1 < l_out && c_out < max_chars then begin
634 s_out.[p_out + k_out ] <- Char.chr (p land 0xff);
635 s_out.[p_out + k_out + 1] <- Char.chr (p lsr 8);
644 let write_utf16_be s_out p_out l_out max_chars w p k_out c_out =
645 if p >= 0xfffe then begin
646 if p <= 0xffff or p > 0x10ffff then failwith "Netconversion.write_utf16_be";
647 (* Must be written as surrogate pair *)
648 if k_out + 3 < l_out && c_out < max_chars then begin
649 let high = (p lsr 10) + 0xd800 in
650 let low = (p land 0x3ff) + 0xdc00 in
651 s_out.[p_out + k_out + 1] <- Char.chr (high land 0xff);
652 s_out.[p_out + k_out ] <- Char.chr (high lsr 8);
653 s_out.[p_out + k_out + 3] <- Char.chr (low land 0xff);
654 s_out.[p_out + k_out + 2] <- Char.chr (low lsr 8);
660 if k_out + 1 < l_out && c_out < max_chars then begin
661 s_out.[p_out + k_out + 1] <- Char.chr (p land 0xff);
662 s_out.[p_out + k_out ] <- Char.chr (p lsr 8);
681 if (in_pos < 0 || in_len < 0 || in_pos + in_len > String.length in_buf ||
682 out_pos < 0 || out_len < 0 || out_pos + out_len > String.length out_buf)
684 invalid_arg "Netconversion.recode";
688 `Enc_iso88591 -> read_iso88591
689 | `Enc_usascii -> read_usascii
690 | `Enc_utf8 -> read_utf8 false
691 | `Enc_java -> read_utf8 true
692 | `Enc_utf16 -> read_utf16
693 | `Enc_utf16_le -> read_utf16_le 0
694 | `Enc_utf16_be -> read_utf16_be 0
697 let to_unicode' = Hashtbl.find Netmappings.to_unicode in_enc in
700 Lazy.force to_unicode' in
701 Netmappings.unlock();
702 read_8bit to_unicode in_enc
705 failwith("Support for the encoding `" ^
706 string_of_encoding in_enc ^
707 "' has not been compiled into Netstring")
712 `Enc_iso88591 -> write_iso88591 out_buf out_pos out_len max_chars subst
713 | `Enc_usascii -> write_usascii out_buf out_pos out_len max_chars subst
714 | `Enc_utf8 -> write_utf8 false
715 out_buf out_pos out_len max_chars subst
716 | `Enc_java -> write_utf8 true out_buf out_pos out_len max_chars subst
717 | `Enc_utf16 -> failwith "Netconversion.recode"
718 | `Enc_utf16_le -> write_utf16_le out_buf out_pos out_len max_chars subst
719 | `Enc_utf16_be -> write_utf16_be out_buf out_pos out_len max_chars subst
722 let from_unicode' = Hashtbl.find Netmappings.from_unicode out_enc
726 Lazy.force from_unicode' in
727 Netmappings.unlock();
728 write_8bit from_unicode out_buf out_pos out_len max_chars subst
731 failwith("Support for the encoding `" ^
732 string_of_encoding out_enc ^
733 "' has not been compiled into Netstring")
736 reader writer in_buf in_pos in_len
743 if p > 255 then raise Not_found;
744 String.make 1 (Char.chr p)
746 if p > 127 then raise Not_found;
747 String.make 1 (Char.chr p)
749 let s = String.create 4 in
750 let n = write_utf8 false s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
753 let s = String.create 4 in
754 let n = write_utf8 true s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
757 let s = String.create 4 in
758 let n = write_utf16_le s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
761 let s = String.create 4 in
762 let n = write_utf16_be s 0 4 1 (fun _ -> raise Not_found) p 0 0 in
765 failwith "Netconversion.makechar"
767 let s = String.create 1 in
770 Hashtbl.find Netmappings.from_unicode enc
773 failwith("Support for the encoding `" ^
774 string_of_encoding enc ^
775 "' has not been compiled into Netstring")
779 Lazy.force from_unicode' in
780 Netmappings.unlock();
782 write_8bit from_unicode s 0 1 1 (fun _ -> raise Not_found) p 0 0 in
787 let recode_string ~in_enc ~out_enc ?(subst = (fun _ -> raise Not_found)) s =
789 let length = String.length s in
791 let out_buf = String.create size in
793 let rec recode_loop k s_done in_enc =
794 (* 'k' bytes of 's' have already been processed, and the result is in
797 (* Recode to 'out_buf': *)
798 let in_len = length - k in
799 let in_done, out_done, in_enc' =
800 recode ~in_enc:in_enc ~in_buf:s ~in_pos:k ~in_len:in_len
801 ~out_enc:out_enc ~out_buf:out_buf ~out_pos:0 ~out_len:size
802 ~max_chars:size ~subst:subst in
803 (* Collect the results: *)
804 let k' = k + in_done in
805 let s_done' = String.sub out_buf 0 out_done :: s_done in
806 (* Still something to do? *)
808 recode_loop k' s_done' in_enc'
810 (* No: Concatenate s_done' to get the final result. *)
811 String.concat "" (List.rev s_done')
814 recode_loop 0 [] in_enc
818 (* ======================================================================
822 * Revision 1.1 2000/11/17 09:57:28 lpadovan
825 * Revision 1.2 2000/08/29 00:46:41 gerd
826 * New type for the Unicode to 8 bit translation table.
827 * The Netmappings tables are now Lazy.t.
829 * Revision 1.1 2000/08/13 00:02:57 gerd
833 * ======================================================================
834 * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.ml):
836 * Revision 1.5 2000/07/27 00:41:14 gerd
839 * Revision 1.4 2000/07/04 22:11:41 gerd
840 * Implemented the enhancements and extensions of
841 * rev. 1.4 of pxp_encoding.mli.
843 * Revision 1.3 2000/05/29 23:48:38 gerd
844 * Changed module names:
845 * Markup_aux into Pxp_aux
846 * Markup_codewriter into Pxp_codewriter
847 * Markup_document into Pxp_document
848 * Markup_dtd into Pxp_dtd
849 * Markup_entity into Pxp_entity
850 * Markup_lexer_types into Pxp_lexer_types
851 * Markup_reader into Pxp_reader
852 * Markup_types into Pxp_types
853 * Markup_yacc into Pxp_yacc
854 * See directory "compatibility" for (almost) compatible wrappers emulating
855 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
857 * Revision 1.2 2000/05/29 21:14:57 gerd
858 * Changed the type 'encoding' into a polymorphic variant.
860 * Revision 1.1 2000/05/20 20:30:50 gerd