helm/DEVEL/pxp/netstring/netconversion.mli

   1 (* $Id$
   2  * ----------------------------------------------------------------------
   3  *)
   4
   5 exception Malformed_code  (* NOTE(review): presumably raised for malformed input byte sequences -- confirm *)
   6
   7 (* Encodings:
   8  * - With the exception of UTF-8 and UTF-16, only single-byte character sets
   9  *   are supported.
  10  * - I took the mappings from www.unicode.org, and the standard names of
  11  *   the character sets from IANA. Obviously, many character sets that
  12  *   could be supported are missing, especially the ISO646 character sets
  13  *   and many EBCDIC code pages.
  14  * - Because of the copyright statement from Unicode, I cannot put the
  15  *   source tables that describe the mappings into the distribution. They
  16  *   are publicly available from www.unicode.org.
  17  * - Because of this, it is difficult for you to extend the list of character
  18  *   sets; you need the source tables I am not allowed to distribute.
  19  *   These tables have a very simple format: Every line describes a pair
  20  *   of code points; the left code (<= 0xff) is the code in the character
  21  *   set, the right code (<= 0xffff) is the Unicode equivalent.
  22  *   For an example, see
  23  *   http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
  24  *   You can send me such files, and I will integrate them into the
  25  *   distribution (if possible).
  26  * - I really do not know very much about the character sets used in
  27  *   East Asia. If you need them, please write the necessary conversion
  28  *   functions and send them to me.
  29  *
  30  * KNOWN PROBLEMS:
  31  * - The following charsets do not have a bijective mapping to Unicode:
  32  *   adobe_standard_encoding, adobe_symbol_encoding,
  33  *   adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
  34  *   simply removes one of the conflicting code point pairs - this might
  35  *   not be what you want.
  36  *)
  37
  38 type encoding =
  39   [  `Enc_utf8       (* UTF-8 *)
  40   |  `Enc_java       (* The variant of UTF-8 used by Java *)
  41   |  `Enc_utf16      (* UTF-16 with unspecified endianness (cannot be written; not allowed for makechar) *)
  42   |  `Enc_utf16_le   (* UTF-16 little endian *)
  43   |  `Enc_utf16_be   (* UTF-16 big endian *)
  44   |  `Enc_usascii    (* US-ASCII (only 7 bit) *)
  45   |  `Enc_iso88591   (* ISO-8859-1 *)
  46   |  `Enc_iso88592   (* ISO-8859-2 *)
  47   |  `Enc_iso88593   (* ISO-8859-3 *)
  48   |  `Enc_iso88594   (* ISO-8859-4 *)
  49   |  `Enc_iso88595   (* ISO-8859-5 *)
  50   |  `Enc_iso88596   (* ISO-8859-6 *)
  51   |  `Enc_iso88597   (* ISO-8859-7 *)
  52   |  `Enc_iso88598   (* ISO-8859-8 *)
  53   |  `Enc_iso88599   (* ISO-8859-9 *)
  54   |  `Enc_iso885910  (* ISO-8859-10 *)
  55   |  `Enc_iso885913  (* ISO-8859-13 *)
  56   |  `Enc_iso885914  (* ISO-8859-14 *)
  57   |  `Enc_iso885915  (* ISO-8859-15 *)
  58   |  `Enc_koi8r      (* KOI8-R *)
  59   |  `Enc_jis0201    (* JIS-0201 *)
  60     (* Microsoft: *)
  61   |  `Enc_windows1250  (* WINDOWS-1250 *)
  62   |  `Enc_windows1251  (* WINDOWS-1251 *)
  63   |  `Enc_windows1252  (* WINDOWS-1252 *)
  64   |  `Enc_windows1253  (* WINDOWS-1253 *)
  65   |  `Enc_windows1254  (* WINDOWS-1254 *)
  66   |  `Enc_windows1255  (* WINDOWS-1255 *)
  67   |  `Enc_windows1256  (* WINDOWS-1256 *)
  68   |  `Enc_windows1257  (* WINDOWS-1257 *)
  69   |  `Enc_windows1258  (* WINDOWS-1258 *)
  70     (* IBM, ASCII-based: *)
  71   |  `Enc_cp437
  72   |  `Enc_cp737
  73   |  `Enc_cp775
  74   |  `Enc_cp850
  75   |  `Enc_cp852
  76   |  `Enc_cp855
  77   |  `Enc_cp856
  78   |  `Enc_cp857
  79   |  `Enc_cp860
  80   |  `Enc_cp861
  81   |  `Enc_cp862
  82   |  `Enc_cp863
  83   |  `Enc_cp864
  84   |  `Enc_cp865
  85   |  `Enc_cp866
  86   |  `Enc_cp869
  87   |  `Enc_cp874
  88   |  `Enc_cp1006
  89    (* IBM, EBCDIC-based: *)
  90   |  `Enc_cp037
  91   |  `Enc_cp424
  92   |  `Enc_cp500
  93   |  `Enc_cp875
  94   |  `Enc_cp1026
  95    (* Adobe: *)
  96   |  `Enc_adobe_standard_encoding
  97   |  `Enc_adobe_symbol_encoding
  98   |  `Enc_adobe_zapf_dingbats_encoding
  99    (* Apple: *)
 100   |  `Enc_macroman
 101
 102   ]
 103     (* The character encodings known to this module, as a polymorphic variant type. *)
 104
 105 val encoding_of_string : string -> encoding;;
 106     (* Returns the encoding denoted by the given name. Fails if no
 107      * encoding of that name is known.
 108      * E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591
 109      *)
 110
 111 val string_of_encoding : encoding -> string;;
 112     (* Returns the name of the given encoding as a string.
 113      * NOTE(review): presumably a name encoding_of_string accepts -- confirm. *)
 114
 115 val makechar : encoding -> int -> string
 116   (* makechar enc i:
 117    * Creates the string representing the code point i in encoding enc.
 118    * Raises Not_found if the character is legal but cannot be represented
 119    * in enc.
 120    *
 121    * Possible encodings: everything but `Enc_utf16 (whose endianness is
 122    * unspecified). *)
 123
 124 val recode : in_enc:encoding ->
 125              in_buf:string ->
 126              in_pos:int ->
 127              in_len:int ->
 128              out_enc:encoding ->
 129              out_buf:string ->
 130              out_pos:int ->
 131              out_len:int ->
 132              max_chars:int ->
 133              subst:(int -> string) -> (int * int * encoding)
 134   (*
 135    * let (in_n, out_n, in_enc') =
 136    *     recode in_enc in_buf in_pos in_len out_enc out_buf out_pos out_len
 137    *            max_chars subst:
 138    * Converts the character sequence contained in the at most in_len bytes
 139    * of in_buf starting at position in_pos, and writes the result
 140    * into at most out_len bytes of out_buf starting at out_pos.
 141    * At most max_chars characters are written into out_buf.
 142    * The characters in in_buf are assumed to be encoded as in_enc, and the
 143    * characters in out_buf will be encoded as out_enc.
 144    * If there is a code point which cannot be represented in out_enc,
 145    * the function subst is called with the code point as argument, and the
 146    * resulting string (which must already be encoded as out_enc) is
 147    * inserted instead.
 148    * Note: It is possible that subst is called several times for the same
 149    * character.
 150    * Return value: out_n is the actual number of bytes written into out_buf.
 151    * in_n is the actual number of bytes that have been converted from
 152    * in_buf; in_n may be smaller than in_len because of incomplete
 153    * multi-byte characters, or because the output buffer has less space
 154    * for characters than the input buffer, or because of a change
 155    * of the encoding variant.
 156    * If there is at least one complete character in in_buf, and at least
 157    * space for one complete character in out_buf, and max_chars >= 1, it is
 158    * guaranteed that in_n > 0 or out_n > 0.
 159    * in_enc' is normally identical to in_enc. However, there are cases
 160    * in which the encoding can be refined when looking at the byte
 161    * sequence; for example whether a little endian or big endian variant
 162    * of the encoding is used. in_enc' is the variant of in_enc that was
 163    * used for the last character that has been converted.
 164    *
 165    * NOTES:
 166    *
 167    * Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd,
 168    * 0x10000 to 0x10ffff.
 169    *
 170    * Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This
 171    * is also true for the sequence 0xc0 0x80 which is used by some software
 172    * (Java) as an alternative representation of the code point 0.
 173    *
 174    * Enc_utf16: When reading from a string encoded as Enc_utf16, a byte
 175    * order mark is expected at the beginning. The detected variant
 176    * (Enc_utf16_le or Enc_utf16_be) is returned. The byte order mark is
 177    * not included in the output string. - It is not possible to
 178    * write as Enc_utf16.
 179    *
 180    * Enc_utf16_le, Enc_utf16_be: When reading from such a string, the
 181    * code point 0xfeff is returned as it is; it is a "zero-width
 182    * non-breaking space". The code point 0xfffe is rejected.
 183    *
 184    * Surrogate pairs: These are recognized (or written) only for a
 185    * UTF-16 encoding; and rejected for any other encoding.
 186    *
 187    * Rejected byte sequences cause the exception Bad_character_stream
 188    * (NOTE(review): this interface declares only Malformed_code; confirm). *)
 189
 190 val recode_string : in_enc:encoding ->
 191                     out_enc:encoding ->
 192                     ?subst:(int -> string) ->
 193                     string ->
 194                     string
 195   (* Recodes a complete string from in_enc to out_enc, and returns the
 196    * recoded string. The optional function subst is invoked for code
 197    * points of the input that cannot be represented in out_enc, and the
 198    * string it returns is substituted.
 199    * If subst is omitted, Not_found is raised in this case.
 200    *)
 201
 202 (* ======================================================================
 203  * History:
 204  *
 205  * $Log$
 206  * Revision 1.1  2000/11/17 09:57:28  lpadovan
 207  * Initial revision
 208  *
 209  * Revision 1.1  2000/08/13 00:02:57  gerd
 210  *      Initial revision.
 211  *
 212  *
 213  * ======================================================================
 214  * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli):
 215  *
 216  * Revision 1.4  2000/07/04 22:05:58  gerd
 217  *      Enhanced version of 'recode'. Labeled arguments.
 218  * New function 'recode_string'.
 219  *
 220  * Revision 1.3  2000/05/29 23:48:38  gerd
 221  *      Changed module names:
 222  *              Markup_aux          into Pxp_aux
 223  *              Markup_codewriter   into Pxp_codewriter
 224  *              Markup_document     into Pxp_document
 225  *              Markup_dtd          into Pxp_dtd
 226  *              Markup_entity       into Pxp_entity
 227  *              Markup_lexer_types  into Pxp_lexer_types
 228  *              Markup_reader       into Pxp_reader
 229  *              Markup_types        into Pxp_types
 230  *              Markup_yacc         into Pxp_yacc
 231  * See directory "compatibility" for (almost) compatible wrappers emulating
 232  * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 233  *
 234  * Revision 1.2  2000/05/29 21:14:57  gerd
 235  *      Changed the type 'encoding' into a polymorphic variant.
 236  *
 237  * Revision 1.1  2000/05/20 20:30:50  gerd
 238  *      Initial revision.
 239  *
 240  *
 241  *)