+++ /dev/null
-(* $Id$
- * ----------------------------------------------------------------------
- *)
-
-exception Malformed_code (* NOTE(review): the only exception declared in
- * this interface. Presumably raised for rejected (malformed) byte sequences,
- * although the description of recode names Bad_character_stream instead -
- * confirm against the implementation. *)
-
-(* Encodings:
- * - With the exception of UTF-8 and UTF-16, only single-byte character sets
- * are supported.
- * - I took the mappings from www.unicode.org, and the standard names of
- * the character sets from IANA. Obviously, many character sets that could
- * be supported are missing, especially the ISO646 character sets and many
- * EBCDIC code pages.
- * - Because of the copyright statement from Unicode, I cannot put the
- * source tables that describe the mappings into the distribution. They
- * are publicly available from www.unicode.org.
- * - Because of this, it is difficult for you to extend the list of character
- * sets; you need the source tables I am not allowed to distribute.
- * These tables have a very simple format: Every line describes a pair
- * of code points; the left code (<= 0xff) is the code in the character
- * set, the right code (<= 0xffff) is the Unicode equivalent.
- * For an example, see
- * http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
- * You can send me such files, and I will integrate them into the
- * distribution (if possible).
- * - I really do not know very much about the character sets used in
- * East Asia. If you need them, please write the necessary conversion
- * functions and send them to me.
- *
- * KNOWN PROBLEMS:
- * - The following charsets do not have a bijective mapping to Unicode:
- * adobe_standard_encoding, adobe_symbol_encoding,
- * adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
- * simply removes one of the conflicting code point pairs - this might
- * not be what you want.
- *)
-
-type encoding = (* Polymorphic variant naming one character encoding per tag *)
- [ `Enc_utf8 (* UTF-8 *)
- | `Enc_java (* The variant of UTF-8 used by Java *)
- | `Enc_utf16 (* UTF-16 with unspecified endianness (restricted usage) *)
- | `Enc_utf16_le (* UTF-16 little endian *)
- | `Enc_utf16_be (* UTF-16 big endian *)
- | `Enc_usascii (* US-ASCII (only 7 bit) *)
- | `Enc_iso88591 (* ISO-8859-1 *)
- | `Enc_iso88592 (* ISO-8859-2 *)
- | `Enc_iso88593 (* ISO-8859-3 *)
- | `Enc_iso88594 (* ISO-8859-4 *)
- | `Enc_iso88595 (* ISO-8859-5 *)
- | `Enc_iso88596 (* ISO-8859-6 *)
- | `Enc_iso88597 (* ISO-8859-7 *)
- | `Enc_iso88598 (* ISO-8859-8 *)
- | `Enc_iso88599 (* ISO-8859-9 *)
- | `Enc_iso885910 (* ISO-8859-10 *)
- | `Enc_iso885913 (* ISO-8859-13 *)
- | `Enc_iso885914 (* ISO-8859-14 *)
- | `Enc_iso885915 (* ISO-8859-15 *)
- | `Enc_koi8r (* KOI8-R *)
- | `Enc_jis0201 (* JIS-0201 *)
- (* Microsoft: *)
- | `Enc_windows1250 (* WINDOWS-1250 *)
- | `Enc_windows1251 (* WINDOWS-1251 *)
- | `Enc_windows1252 (* WINDOWS-1252 *)
- | `Enc_windows1253 (* WINDOWS-1253 *)
- | `Enc_windows1254 (* WINDOWS-1254 *)
- | `Enc_windows1255 (* WINDOWS-1255 *)
- | `Enc_windows1256 (* WINDOWS-1256 *)
- | `Enc_windows1257 (* WINDOWS-1257 *)
- | `Enc_windows1258 (* WINDOWS-1258 *)
- (* IBM, ASCII-based: *)
- | `Enc_cp437
- | `Enc_cp737
- | `Enc_cp775
- | `Enc_cp850
- | `Enc_cp852
- | `Enc_cp855
- | `Enc_cp856
- | `Enc_cp857
- | `Enc_cp860
- | `Enc_cp861
- | `Enc_cp862
- | `Enc_cp863
- | `Enc_cp864
- | `Enc_cp865
- | `Enc_cp866
- | `Enc_cp869
- | `Enc_cp874
- | `Enc_cp1006
- (* IBM, EBCDIC-based: *)
- | `Enc_cp037
- | `Enc_cp424
- | `Enc_cp500
- | `Enc_cp875
- | `Enc_cp1026
- (* Adobe: *)
- | `Enc_adobe_standard_encoding
- | `Enc_adobe_symbol_encoding
- | `Enc_adobe_zapf_dingbats_encoding
- (* Apple: *)
- | `Enc_macroman
-
- ]
-
-
-val encoding_of_string : string -> encoding;;
- (* Returns the encoding denoted by the given name. Fails if the name
- * does not denote a known encoding.
- * E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591
- *)
-
-val string_of_encoding : encoding -> string;;
- (* Returns the name of the given encoding as a string.
- * NOTE(review): presumably a name that encoding_of_string accepts - confirm.
- *)
-
-
-val makechar : encoding -> int -> string
- (* makechar enc i:
- * Creates the string representing the code point i in encoding enc.
- * Raises Not_found if the character is legal but cannot be represented
- * in enc.
- * NOTE(review): the behaviour for illegal code points is not documented
- * here - confirm against the implementation.
- *
- * Possible encodings: every encoding except `Enc_utf16.
- *)
-
-val recode : in_enc:encoding ->
- in_buf:string ->
- in_pos:int ->
- in_len:int ->
- out_enc:encoding ->
- out_buf:string ->
- out_pos:int ->
- out_len:int ->
- max_chars:int ->
- subst:(int -> string) -> (int * int * encoding)
- (*
- * let (in_n, out_n, in_enc') =
- * recode in_enc in_buf in_pos in_len out_enc out_buf out_pos out_len
- * max_chars subst:
- * Converts the character sequence contained in the at most in_len bytes
- * of in_buf starting at position in_pos, and writes the result
- * into at most out_len bytes of out_buf starting at out_pos.
- * At most max_chars characters are written into out_buf.
- * The characters in in_buf are assumed to be encoded as in_enc, and the
- * characters in out_buf will be encoded as out_enc.
- * If there is a code point which cannot be represented in out_enc,
- * the function subst is called with the code point as argument, and the
- * resulting string (which must already be encoded as out_enc) is
- * inserted instead.
- * Note: It is possible that subst is called several times for the same
- * character.
- * Return value: out_n is the actual number of bytes written into out_buf.
- * in_n is the actual number of bytes that have been converted from
- * in_buf; in_n may be smaller than in_len because of incomplete
- * multi-byte characters, or because the output buffer has less space
- * for characters than the input buffer, or because of a change
- * of the encoding variant.
- * If there is at least one complete character in in_buf, and at least
- * space for one complete character in out_buf, and max_chars >= 1, it is
- * guaranteed that in_n > 0 or out_n > 0.
- * in_enc' is normally identical to in_enc. However, there are cases
- * in which the encoding can be refined when looking at the byte
- * sequence; for example whether a little endian or big endian variant
- * of the encoding is used. in_enc' is the variant of in_enc that was
- * used for the last character that has been converted.
- *
- * NOTES:
- *
- * Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd,
- * 0x10000 to 0x10ffff.
- *
- * Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This
- * is also true for the sequence 0xc0 0x80 which is used by some software
- * (Java) as paraphrase for the code point 0.
- *
- * Enc_utf16: When reading from a string encoded as Enc_utf16, a byte
- * order mark is expected at the beginning. The detected variant
- * (Enc_utf16_le or Enc_utf16_be) is returned. The byte order mark is
- * not included in the output string. - It is not possible to
- * write as Enc_utf16.
- *
- * Enc_utf16_le, Enc_utf16_be: When reading from such a string, the
- * code point 0xfeff is returned as it is; it is a "zero-width
- * non-breaking space". The code point 0xfffe is rejected.
- *
- * Surrogate pairs: These are recognized (or written) only for a
- * UTF-16 encoding; and rejected for any other encoding.
- *
- * Rejected byte sequences cause the exception Bad_character_stream.
- * NOTE(review): this interface declares only Malformed_code and no
- * Bad_character_stream; presumably Malformed_code is meant - confirm
- * against the implementation.
- *)
-
-val recode_string : in_enc:encoding ->
- out_enc:encoding ->
- ?subst:(int -> string) ->
- string ->
- string
- (* Recodes a complete string from in_enc to out_enc, and returns the
- * recoded string.
- * The function subst is invoked for every code point of the input that
- * cannot be represented in out_enc, and the string it returns is
- * substituted for that code point.
- * If subst is not passed, Not_found is raised in this case.
- *)
-
-(* ======================================================================
- * History:
- *
- * $Log$
- * Revision 1.1 2000/11/17 09:57:28 lpadovan
- * Initial revision
- *
- * Revision 1.1 2000/08/13 00:02:57 gerd
- * Initial revision.
- *
- *
- * ======================================================================
- * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli):
- *
- * Revision 1.4 2000/07/04 22:05:58 gerd
- * Enhanced version of 'recode'. Labeled arguments.
- * New function 'recode_string'.
- *
- * Revision 1.3 2000/05/29 23:48:38 gerd
- * Changed module names:
- * Markup_aux into Pxp_aux
- * Markup_codewriter into Pxp_codewriter
- * Markup_document into Pxp_document
- * Markup_dtd into Pxp_dtd
- * Markup_entity into Pxp_entity
- * Markup_lexer_types into Pxp_lexer_types
- * Markup_reader into Pxp_reader
- * Markup_types into Pxp_types
- * Markup_yacc into Pxp_yacc
- * See directory "compatibility" for (almost) compatible wrappers emulating
- * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
- *
- * Revision 1.2 2000/05/29 21:14:57 gerd
- * Changed the type 'encoding' into a polymorphic variant.
- *
- * Revision 1.1 2000/05/20 20:30:50 gerd
- * Initial revision.
- *
- *
- *)