2 * ----------------------------------------------------------------------
5 exception Malformed_code (* NOTE(review): presumably raised when input is not valid in the declared encoding - confirm against the implementation *)
8 * - With the exception of UTF-8 and UTF-16, only single-byte character sets
10 * - I took the mappings from www.unicode.org, and the standard names of
11 * the character sets from IANA. Obviously, many character sets are missing
12 * that can be supported; especially ISO646 character sets, many EBCDIC
14 * - Because of the copyright statement from Unicode, I cannot put the
15 * source tables that describe the mappings into the distribution. They
16 * are publicly available from www.unicode.org.
17 * - Because of this, it is difficult for you to extend the list of character
18 * sets; you need the source tables I am not allowed to distribute.
19 * These tables have a very simple format: Every line describes a pair
20 * of code points; the left code (<= 0xff) is the code in the character
21 * set, the right code (<= 0xffff) is the Unicode equivalent.
23 * http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
24 * You can send me such files, and I will integrate them into the
25 * distribution (if possible).
26 * - I really do not know very much about the character sets used in
27 * East Asia. If you need them, please write the necessary conversion
28 * functions and send them to me.
31 * - The following charsets do not have a bijective mapping to Unicode:
32 * adobe_standard_encoding, adobe_symbol_encoding,
33 * adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
34 * simply removes one of the conflicting code point pairs - this might
39 [ `Enc_utf8 (* UTF-8 *)
40 | `Enc_java (* The variant of UTF-8 used by Java *)
41 | `Enc_utf16 (* UTF-16 with unspecified endianness (restricted usage) *)
42 | `Enc_utf16_le (* UTF-16 little endian *)
43 | `Enc_utf16_be (* UTF-16 big endian *)
44 | `Enc_usascii (* US-ASCII (only 7 bit) *)
45 | `Enc_iso88591 (* ISO-8859-1 *)
46 | `Enc_iso88592 (* ISO-8859-2 *)
47 | `Enc_iso88593 (* ISO-8859-3 *)
48 | `Enc_iso88594 (* ISO-8859-4 *)
49 | `Enc_iso88595 (* ISO-8859-5 *)
50 | `Enc_iso88596 (* ISO-8859-6 *)
51 | `Enc_iso88597 (* ISO-8859-7 *)
52 | `Enc_iso88598 (* ISO-8859-8 *)
53 | `Enc_iso88599 (* ISO-8859-9 *)
54 | `Enc_iso885910 (* ISO-8859-10 *)
55 | `Enc_iso885913 (* ISO-8859-13 *)
56 | `Enc_iso885914 (* ISO-8859-14 *)
57 | `Enc_iso885915 (* ISO-8859-15 *)
58 | `Enc_koi8r (* KOI8-R *)
59 | `Enc_jis0201 (* JIS-0201 *)
61 | `Enc_windows1250 (* WINDOWS-1250 *)
62 | `Enc_windows1251 (* WINDOWS-1251 *)
63 | `Enc_windows1252 (* WINDOWS-1252 *)
64 | `Enc_windows1253 (* WINDOWS-1253 *)
65 | `Enc_windows1254 (* WINDOWS-1254 *)
66 | `Enc_windows1255 (* WINDOWS-1255 *)
67 | `Enc_windows1256 (* WINDOWS-1256 *)
68 | `Enc_windows1257 (* WINDOWS-1257 *)
69 | `Enc_windows1258 (* WINDOWS-1258 *)
70 (* IBM, ASCII-based: *)
89 (* IBM, EBCDIC-based: *)
96 | `Enc_adobe_standard_encoding
97 | `Enc_adobe_symbol_encoding
98 | `Enc_adobe_zapf_dingbats_encoding
105 val encoding_of_string : string -> encoding;;
106 (* Returns the encoding denoted by the given name. Fails if the
107 * name does not denote a known encoding.
108 * E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591
111 val string_of_encoding : encoding -> string;;
112 (* Returns the name of the given encoding as a string. NOTE(review): presumably a name that encoding_of_string accepts - confirm. *)
115 val makechar : encoding -> int -> string (* arguments: the target encoding, then the code point; result: the encoded character *)
117 * Creates the string representing the code point i in encoding enc.
118 * Raises Not_found if the character is legal but cannot be represented
121 * Possible encodings: everything but `Enc_utf16.
124 val recode : in_enc:encoding ->
133 subst:(int -> string) -> (int * int * encoding)
135 * let (in_n, out_n, in_enc') =
136 * recode in_enc in_buf in_len out_enc out_buf out_pos out_len max_chars
138 * Converts the character sequence contained in the at most in_len bytes
139 * of in_buf starting at position in_pos, and writes the result
140 * into at most out_len bytes of out_buf starting at out_pos.
141 * At most max_chars characters are written into out_buf.
142 * The characters in in_buf are assumed to be encoded as in_enc, and the
143 * characters in out_buf will be encoded as out_enc.
144 * If there is a code point which cannot be represented in out_enc,
145 * the function subst is called with the code point as argument, and the
146 * resulting string (which must already be encoded as out_enc) is
148 * Note: It is possible that subst is called several times for the same
150 * Return value: out_n is the actual number of bytes written into out_buf.
151 * in_n is the actual number of bytes that have been converted from
152 * in_buf; in_n may be smaller than in_len because of incomplete
153 * multi-byte characters, or because the output buffer has less space
154 * for characters than the input buffer, or because of a change
155 * of the encoding variant.
156 * If there is at least one complete character in in_buf, and at least
157 * space for one complete character in out_buf, and max_chars >= 1, it is
158 * guaranteed that in_n > 0 or out_n > 0.
159 * in_enc' is normally identical to in_enc. However, there are cases
160 * in which the encoding can be refined when looking at the byte
161 * sequence; for example whether a little endian or big endian variant
162 * of the encoding is used. in_enc' is the variant of in_enc that was
163 * used for the last character that has been converted.
167 * Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd,
168 * 0x10000 to 0x10ffff.
170 * Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This
171 * is also true for the sequence 0xc0 0x80 which is used by some software
172 * (Java) as an alternative encoding of the code point 0.
174 * Enc_utf16: When reading from a string encoded as Enc_utf16, a byte
175 * order mark is expected at the beginning. The detected variant
176 * (Enc_utf16_le or Enc_utf16_be) is returned. The byte order mark is
177 * not included in the output string. - It is not possible to
178 * write as Enc_utf16.
180 * Enc_utf16_le, Enc_utf16_be: When reading from such a string, the
181 * code point 0xfeff is returned as it is; it is a "zero-width
182 * non-breaking space". The code point 0xfffe is rejected.
184 * Surrogate pairs: These are recognized (or written) only for a
185 * UTF-16 encoding; and rejected for any other encoding.
187 * Rejected byte sequences cause the exception Malformed_code.
190 val recode_string : in_enc:encoding ->
192 ?subst:(int -> string) ->
195 (* Recodes a complete string from in_enc to out_enc, and returns it.
196 * The function subst is invoked for code points of in_enc that cannot
197 * be represented in out_enc, and the result of the function invocation
199 * If subst is missing, Not_found is raised in this case.
202 (* ======================================================================
206 * Revision 1.1 2000/11/17 09:57:28 lpadovan
209 * Revision 1.1 2000/08/13 00:02:57 gerd
213 * ======================================================================
214 * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli):
216 * Revision 1.4 2000/07/04 22:05:58 gerd
217 * Enhanced version of 'recode'. Labeled arguments.
218 * New function 'recode_string'.
220 * Revision 1.3 2000/05/29 23:48:38 gerd
221 * Changed module names:
222 * Markup_aux into Pxp_aux
223 * Markup_codewriter into Pxp_codewriter
224 * Markup_document into Pxp_document
225 * Markup_dtd into Pxp_dtd
226 * Markup_entity into Pxp_entity
227 * Markup_lexer_types into Pxp_lexer_types
228 * Markup_reader into Pxp_reader
229 * Markup_types into Pxp_types
230 * Markup_yacc into Pxp_yacc
231 * See directory "compatibility" for (almost) compatible wrappers emulating
232 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
234 * Revision 1.2 2000/05/29 21:14:57 gerd
235 * Changed the type 'encoding' into a polymorphic variant.
237 * Revision 1.1 2000/05/20 20:30:50 gerd