(* $Id$
 * ----------------------------------------------------------------------
 *)

exception Malformed_code
  (* NOTE(review): this interface does not state when Malformed_code is
   * raised. Presumably it signals an input byte sequence that is not valid
   * in the assumed encoding - TODO confirm against the implementation.
   * Be aware that the description of recode in this file names a different
   * exception (Bad_character_stream) for rejected byte sequences, which is
   * not declared here.
   *)

(* Encodings:
 * - With the exception of UTF-8 and UTF-16, only single-byte character sets
 *   are supported.
 * - I took the mappings from www.unicode.org, and the standard names of
 *   the character sets from IANA. Obviously, many character sets that could
 *   be supported are missing, especially the ISO646 character sets and many
 *   EBCDIC code pages.
 * - Because of the copyright statement from Unicode, I cannot put the
 *   source tables that describe the mappings into the distribution. They
 *   are publicly available from www.unicode.org.
 * - Because of this, it is difficult for you to extend the list of character 
 *   sets; you need the source tables I am not allowed to distribute.
 *   These tables have a very simple format: Every line describes a pair
 *   of code points; the left code (<= 0xff) is the code in the character
 *   set, the right code (<= 0xffff) is the Unicode equivalent.
 *   For an example, see
 *   http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT
 *   You can send me such files, and I will integrate them into the 
 *   distribution (if possible).
 * - I really do not know very much about the character sets used in
 *   East Asia. If you need them, please write the necessary conversion
 *   functions and send them to me.
 *
 * KNOWN PROBLEMS:
 * - The following charsets do not have a bijective mapping to Unicode:
 *   adobe_standard_encoding, adobe_symbol_encoding, 
 *   adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation
 *   simply removes one of the conflicting code point pairs - this might
 *   not be what you want.
 *)

(* The character encodings known to this module, given as a polymorphic
 * variant type. Every tag names exactly one character set or one
 * transformation format; the tags are grouped by origin below.
 * Note that `Enc_utf16 (unspecified byte order) has restricted usage:
 * the descriptions of makechar and recode in this file exclude it as an
 * output encoding.
 *)
type encoding =
  [  `Enc_utf8       (* UTF-8 *)
  |  `Enc_java       (* The variant of UTF-8 used by Java *)
  |  `Enc_utf16      (* UTF-16 with unspecified endianness (restricted usage) *)
  |  `Enc_utf16_le   (* UTF-16 little endian *)
  |  `Enc_utf16_be   (* UTF-16 big endian *)
  |  `Enc_usascii    (* US-ASCII (only 7 bit) *)
  |  `Enc_iso88591   (* ISO-8859-1 *)
  |  `Enc_iso88592   (* ISO-8859-2 *)
  |  `Enc_iso88593   (* ISO-8859-3 *)
  |  `Enc_iso88594   (* ISO-8859-4 *)
  |  `Enc_iso88595   (* ISO-8859-5 *)
  |  `Enc_iso88596   (* ISO-8859-6 *)
  |  `Enc_iso88597   (* ISO-8859-7 *)
  |  `Enc_iso88598   (* ISO-8859-8 *)
  |  `Enc_iso88599   (* ISO-8859-9 *)
  |  `Enc_iso885910  (* ISO-8859-10 *)
  |  `Enc_iso885913  (* ISO-8859-13 *)
  |  `Enc_iso885914  (* ISO-8859-14 *)
  |  `Enc_iso885915  (* ISO-8859-15 *)
  |  `Enc_koi8r      (* KOI8-R *)
  |  `Enc_jis0201    (* JIS-0201 *)
    (* Microsoft code pages: *)
  |  `Enc_windows1250  (* WINDOWS-1250 *)
  |  `Enc_windows1251  (* WINDOWS-1251 *)
  |  `Enc_windows1252  (* WINDOWS-1252 *)
  |  `Enc_windows1253  (* WINDOWS-1253 *)
  |  `Enc_windows1254  (* WINDOWS-1254 *)
  |  `Enc_windows1255  (* WINDOWS-1255 *)
  |  `Enc_windows1256  (* WINDOWS-1256 *)
  |  `Enc_windows1257  (* WINDOWS-1257 *)
  |  `Enc_windows1258  (* WINDOWS-1258 *)
    (* IBM code pages, ASCII-based: *)
  |  `Enc_cp437
  |  `Enc_cp737
  |  `Enc_cp775
  |  `Enc_cp850
  |  `Enc_cp852
  |  `Enc_cp855
  |  `Enc_cp856
  |  `Enc_cp857
  |  `Enc_cp860
  |  `Enc_cp861
  |  `Enc_cp862
  |  `Enc_cp863
  |  `Enc_cp864
  |  `Enc_cp865
  |  `Enc_cp866
  |  `Enc_cp869
  |  `Enc_cp874
  |  `Enc_cp1006
   (* IBM code pages, EBCDIC-based: *)
  |  `Enc_cp037
  |  `Enc_cp424
  |  `Enc_cp500
  |  `Enc_cp875
  |  `Enc_cp1026
   (* Adobe (see KNOWN PROBLEMS at the top of this file: the mapping of
    * these three character sets to Unicode is not bijective): *)
  |  `Enc_adobe_standard_encoding
  |  `Enc_adobe_symbol_encoding
  |  `Enc_adobe_zapf_dingbats_encoding
   (* Apple: *)
  |  `Enc_macroman

  ]


val encoding_of_string : string -> encoding;;
    (* encoding_of_string name:
     * Returns the encoding that is denoted by name. Fails if no encoding
     * of that name is known.
     * E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591
     *
     * NOTE(review): the kind of failure (which exception is raised) is not
     * specified in this interface; presumably it is Failure - TODO confirm.
     * Whether matching is case-sensitive is likewise not stated here.
     *)

val string_of_encoding : encoding -> string;;
    (* string_of_encoding enc:
     * Returns the name of the encoding enc as a string.
     *
     * NOTE(review): presumably the result is the IANA standard name and is
     * accepted by encoding_of_string again - confirm against the
     * implementation.
     *)


val makechar : encoding -> int -> string
  (* makechar enc i:
   * Creates the string representing the code point i in encoding enc,
   * i.e. the byte sequence that encodes this single character.
   *
   * Raises Not_found if the character is legal but cannot be represented 
   * in enc.
   * 
   * Possible encodings: everything but `Enc_utf16. (The description of
   * recode in this file explains that it is not possible to write as
   * `Enc_utf16; use `Enc_utf16_le or `Enc_utf16_be instead.)
   *
   * NOTE(review): what happens for an illegal code point i is not
   * specified here - TODO confirm.
   *)

val recode : in_enc:encoding -> 
             in_buf:string -> 
	     in_pos:int ->
	     in_len:int -> 
	     out_enc:encoding -> 
	     out_buf:string -> 
	     out_pos:int ->
	     out_len:int ->
	     max_chars:int ->
             subst:(int -> string) -> (int * int * encoding)
  (* 
   * let (in_n, out_n, in_enc') = 
   *     recode ~in_enc ~in_buf ~in_pos ~in_len ~out_enc ~out_buf ~out_pos
   *            ~out_len ~max_chars ~subst
   *
   * Converts the character sequence contained in the at most in_len bytes
   * of in_buf starting at position in_pos, and writes the result 
   * into at most out_len bytes of out_buf starting at out_pos.
   * At most max_chars characters are written into out_buf.
   * The characters in in_buf are assumed to be encoded as in_enc, and the 
   * characters in out_buf will be encoded as out_enc.
   *
   * NOTE(review): out_buf has type string although it is written to; this
   * relies on strings being mutable. Confirm before building this interface
   * with a compiler in which strings are immutable.
   *
   * Substitution:
   * If there is a code point which cannot be represented in out_enc,
   * the function subst is called with the code point as argument, and the
   * resulting string (which must already be encoded as out_enc) is
   * inserted instead. 
   * Note: It is possible that subst is called several times for the same
   * character.
   *
   * Return value (in_n, out_n, in_enc'):
   * out_n is the actual number of bytes written into out_buf.
   * in_n is the actual number of bytes that have been converted from
   * in_buf; in_n may be smaller than in_len because of incomplete
   * multi-byte characters, or because the output buffer has less space
   * for characters than the input buffer, or because of a change
   * of the encoding variant.
   * If there is at least one complete character in in_buf, and at least
   * space for one complete character in out_buf, and max_chars >= 1, it is 
   * guaranteed that in_n > 0 or out_n > 0.
   * in_enc' is normally identical to in_enc. However, there are cases
   * in which the encoding can be refined when looking at the byte
   * sequence; for example whether a little endian or big endian variant
   * of the encoding is used. in_enc' is the variant of in_enc that was
   * used for the last character that has been converted.
   *
   * NOTES:
   *
   * Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd,
   * 0x10000 to 0x10ffff.
   *
   * `Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This
   * is also true for the sequence 0xc0 0x80 which is used by some software
   * (Java) as paraphrase for the code point 0. (Presumably `Enc_java is the
   * encoding to use for such input - TODO confirm.)
   *
   * `Enc_utf16: When reading from a string encoded as `Enc_utf16, a byte
   * order mark is expected at the beginning. The detected variant 
   * (`Enc_utf16_le or `Enc_utf16_be) is returned. The byte order mark is
   * not included into the output string. - It is not possible to
   * write as `Enc_utf16.
   *
   * `Enc_utf16_le, `Enc_utf16_be: When reading from such a string, the
   * code point 0xfeff is returned as it is; it is a "zero-width 
   * non-breaking space". The code point 0xfffe is rejected.
   *
   * Surrogate pairs: These are recognized (or written) only for a
   * UTF-16 encoding; and rejected for any other encoding.
   *
   * Rejected byte sequences cause the exception Bad_character_stream.
   * NOTE(review): no exception of that name is declared in this interface;
   * the only exception declared here is Malformed_code. Presumably
   * Malformed_code is the one actually raised - confirm against the
   * implementation and correct this sentence.
   *)

val recode_string : in_enc:encoding -> 
                    out_enc:encoding ->
		    ?subst:(int -> string) ->
		    string ->
                    string 
  (* recode_string ~in_enc ~out_enc ?subst s:
   * Recodes the complete string s from in_enc to out_enc, and returns the
   * recoded string.
   *
   * The function subst is invoked for code points of in_enc that cannot
   * be represented in out_enc, and the result of the function invocation
   * is substituted. As described for recode, the string returned by subst
   * must already be encoded as out_enc.
   *
   * If subst is missing, Not_found is raised in this case.
   *
   * NOTE(review): the behaviour for malformed input is not stated here;
   * presumably it is the same as for recode - TODO confirm.
   *)

(* ======================================================================
 * History:
 * 
 * $Log$
 * Revision 1.1  2000/11/17 09:57:28  lpadovan
 * Initial revision
 *
 * Revision 1.1  2000/08/13 00:02:57  gerd
 * 	Initial revision.
 *
 *
 * ======================================================================
 * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli):
 *
 * Revision 1.4  2000/07/04 22:05:58  gerd
 * 	Enhanced version of 'recode'. Labeled arguments.
 * New function 'recode_string'.
 *
 * Revision 1.3  2000/05/29 23:48:38  gerd
 * 	Changed module names:
 * 		Markup_aux          into Pxp_aux
 * 		Markup_codewriter   into Pxp_codewriter
 * 		Markup_document     into Pxp_document
 * 		Markup_dtd          into Pxp_dtd
 * 		Markup_entity       into Pxp_entity
 * 		Markup_lexer_types  into Pxp_lexer_types
 * 		Markup_reader       into Pxp_reader
 * 		Markup_types        into Pxp_types
 * 		Markup_yacc         into Pxp_yacc
 * See directory "compatibility" for (almost) compatible wrappers emulating
 * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc.
 *
 * Revision 1.2  2000/05/29 21:14:57  gerd
 * 	Changed the type 'encoding' into a polymorphic variant.
 *
 * Revision 1.1  2000/05/20 20:30:50  gerd
 * 	Initial revision.
 *
 * 
 *)