X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnetconversion.mli;fp=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fnetconversion.mli;h=5e3e4b4e11dd55dab3d556f6a32571841619bde0;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/netstring/netconversion.mli b/helm/DEVEL/pxp/netstring/netconversion.mli new file mode 100644 index 000000000..5e3e4b4e1 --- /dev/null +++ b/helm/DEVEL/pxp/netstring/netconversion.mli @@ -0,0 +1,241 @@ +(* $Id$ + * ---------------------------------------------------------------------- + *) + +exception Malformed_code + +(* Encodings: + * - With the exception of UTF-8 and UTF-16, only single-byte character sets + * are supported. + * - I took the mappings from www.unicode.org, and the standard names of + * the character sets from IANA. Obviously, many character sets are missing + * that can be supported; especially ISO646 character sets and many EBCDIC + * code pages. + * - Because of the copyright statement from Unicode, I cannot put the + * source tables that describe the mappings into the distribution. They + * are publicly available from www.unicode.org. + * - Because of this, it is difficult for you to extend the list of character + * sets; you need the source tables I am not allowed to distribute. + * These tables have a very simple format: Every line describes a pair + * of code points; the left code (<= 0xff) is the code in the character + * set, the right code (<= 0xffff) is the Unicode equivalent. + * For an example, see + * http://www.unicode.org/Public/MAPPINGS/ISO8859/8859-2.TXT + * You can send me such files, and I will integrate them into the + * distribution (if possible). + * - I really do not know very much about the character sets used in + * East Asia. If you need them, please write the necessary conversion + * functions and send them to me. 
+ * + * KNOWN PROBLEMS: + * - The following charsets do not have a bijective mapping to Unicode: + * adobe_standard_encoding, adobe_symbol_encoding, + * adobe_zapf_dingbats_encoding, cp1002 (0xFEBE). The current implementation + * simply removes one of the conflicting code point pairs - this might + * not be what you want. + *) + +type encoding = + [ `Enc_utf8 (* UTF-8 *) + | `Enc_java (* The variant of UTF-8 used by Java *) + | `Enc_utf16 (* UTF-16 with unspecified endianness (restricted usage) *) + | `Enc_utf16_le (* UTF-16 little endian *) + | `Enc_utf16_be (* UTF-16 big endian *) + | `Enc_usascii (* US-ASCII (only 7 bit) *) + | `Enc_iso88591 (* ISO-8859-1 *) + | `Enc_iso88592 (* ISO-8859-2 *) + | `Enc_iso88593 (* ISO-8859-3 *) + | `Enc_iso88594 (* ISO-8859-4 *) + | `Enc_iso88595 (* ISO-8859-5 *) + | `Enc_iso88596 (* ISO-8859-6 *) + | `Enc_iso88597 (* ISO-8859-7 *) + | `Enc_iso88598 (* ISO-8859-8 *) + | `Enc_iso88599 (* ISO-8859-9 *) + | `Enc_iso885910 (* ISO-8859-10 *) + | `Enc_iso885913 (* ISO-8859-13 *) + | `Enc_iso885914 (* ISO-8859-14 *) + | `Enc_iso885915 (* ISO-8859-15 *) + | `Enc_koi8r (* KOI8-R *) + | `Enc_jis0201 (* JIS-0201 *) + (* Microsoft: *) + | `Enc_windows1250 (* WINDOWS-1250 *) + | `Enc_windows1251 (* WINDOWS-1251 *) + | `Enc_windows1252 (* WINDOWS-1252 *) + | `Enc_windows1253 (* WINDOWS-1253 *) + | `Enc_windows1254 (* WINDOWS-1254 *) + | `Enc_windows1255 (* WINDOWS-1255 *) + | `Enc_windows1256 (* WINDOWS-1256 *) + | `Enc_windows1257 (* WINDOWS-1257 *) + | `Enc_windows1258 (* WINDOWS-1258 *) + (* IBM, ASCII-based: *) + | `Enc_cp437 + | `Enc_cp737 + | `Enc_cp775 + | `Enc_cp850 + | `Enc_cp852 + | `Enc_cp855 + | `Enc_cp856 + | `Enc_cp857 + | `Enc_cp860 + | `Enc_cp861 + | `Enc_cp862 + | `Enc_cp863 + | `Enc_cp864 + | `Enc_cp865 + | `Enc_cp866 + | `Enc_cp869 + | `Enc_cp874 + | `Enc_cp1006 + (* IBM, EBCDIC-based: *) + | `Enc_cp037 + | `Enc_cp424 + | `Enc_cp500 + | `Enc_cp875 + | `Enc_cp1026 + (* Adobe: *) + | `Enc_adobe_standard_encoding + | 
`Enc_adobe_symbol_encoding + | `Enc_adobe_zapf_dingbats_encoding + (* Apple: *) + | `Enc_macroman + + ] + + +val encoding_of_string : string -> encoding;; + (* Returns the encoding denoted by the given name. Fails if the + * encoding is unknown. + * E.g. encoding_of_string "iso-8859-1" = `Enc_iso88591 + *) + +val string_of_encoding : encoding -> string;; + (* Returns the name of the encoding. *) + + +val makechar : encoding -> int -> string + (* makechar enc i: + * Creates the string representing the code point i in encoding enc. + * Raises Not_found if the character is legal but cannot be represented + * in enc. + * + * Possible encodings: everything but `Enc_utf16. + *) + +val recode : in_enc:encoding -> + in_buf:string -> + in_pos:int -> + in_len:int -> + out_enc:encoding -> + out_buf:string -> + out_pos:int -> + out_len:int -> + max_chars:int -> + subst:(int -> string) -> (int * int * encoding) + (* + * let (in_n, out_n, in_enc') = + * recode in_enc in_buf in_pos in_len out_enc out_buf out_pos out_len max_chars + * subst: + * Converts the character sequence contained in the at most in_len bytes + * of in_buf starting at position in_pos, and writes the result + * into at most out_len bytes of out_buf starting at out_pos. + * At most max_chars characters are written into out_buf. + * The characters in in_buf are assumed to be encoded as in_enc, and the + * characters in out_buf will be encoded as out_enc. + * If there is a code point which cannot be represented in out_enc, + * the function subst is called with the code point as argument, and the + * resulting string (which must already be encoded as out_enc) is + * inserted instead. + * Note: It is possible that subst is called several times for the same + * character. + * Return value: out_n is the actual number of bytes written into out_buf. 
+ * in_n is the actual number of bytes that have been converted from + * in_buf; in_n may be smaller than in_len because of incomplete + * multi-byte characters, or because the output buffer has less space + * for characters than the input buffer, or because of a change + * of the encoding variant. + * If there is at least one complete character in in_buf, and at least + * space for one complete character in out_buf, and max_chars >= 1, it is + * guaranteed that in_n > 0 or out_n > 0. + * in_enc' is normally identical to in_enc. However, there are cases + * in which the encoding can be refined when looking at the byte + * sequence; for example whether a little endian or big endian variant + * of the encoding is used. in_enc' is the variant of in_enc that was + * used for the last character that has been converted. + * + * NOTES: + * + * Supported range of code points: 0 to 0xd7ff, 0xe000 to 0xfffd, + * 0x10000 to 0x10ffff. + * + * Enc_utf8: Malformed UTF-8 byte sequences are always rejected. This + * is also true for the sequence 0xc0 0x80 which is used by some software + * (Java) as a paraphrase for the code point 0. + * + * Enc_utf16: When reading from a string encoded as Enc_utf16, a byte + * order mark is expected at the beginning. The detected variant + * (Enc_utf16_le or Enc_utf16_be) is returned. The byte order mark is + * not included in the output string. - It is not possible to + * write as Enc_utf16. + * + * Enc_utf16_le, Enc_utf16_be: When reading from such a string, the + * code point 0xfeff is returned as it is; it is a "zero-width + * non-breaking space". The code point 0xfffe is rejected. + * + * Surrogate pairs: These are recognized (or written) only for a + * UTF-16 encoding; and rejected for any other encoding. + * + * Rejected byte sequences presumably raise Malformed_code, the only exception declared in this interface (this text formerly named Bad_character_stream, which is not declared here - confirm against the implementation). 
+ *) + +val recode_string : in_enc:encoding -> + out_enc:encoding -> + ?subst:(int -> string) -> + string -> + string + (* Recodes a complete string from in_enc to out_enc, and returns it. + * The function subst is invoked for code points of in_enc that cannot + * be represented in out_enc, and the result of the function invocation + * is substituted. + * If subst is missing, Not_found is raised in this case. + *) + +(* ====================================================================== + * History: + * + * $Log$ + * Revision 1.1 2000/11/17 09:57:28 lpadovan + * Initial revision + * + * Revision 1.1 2000/08/13 00:02:57 gerd + * Initial revision. + * + * + * ====================================================================== + * OLD LOGS FROM THE PXP PACKAGE (FILE NAME pxp_encoding.mli): + * + * Revision 1.4 2000/07/04 22:05:58 gerd + * Enhanced version of 'recode'. Labeled arguments. + * New function 'recode_string'. + * + * Revision 1.3 2000/05/29 23:48:38 gerd + * Changed module names: + * Markup_aux into Pxp_aux + * Markup_codewriter into Pxp_codewriter + * Markup_document into Pxp_document + * Markup_dtd into Pxp_dtd + * Markup_entity into Pxp_entity + * Markup_lexer_types into Pxp_lexer_types + * Markup_reader into Pxp_reader + * Markup_types into Pxp_types + * Markup_yacc into Pxp_yacc + * See directory "compatibility" for (almost) compatible wrappers emulating + * Markup_document, Markup_dtd, Markup_reader, Markup_types, and Markup_yacc. + * + * Revision 1.2 2000/05/29 21:14:57 gerd + * Changed the type 'encoding' into a polymorphic variant. + * + * Revision 1.1 2000/05/20 20:30:50 gerd + * Initial revision. + * + * + *)