X-Git-Url: http://matita.cs.unibo.it/gitweb/?a=blobdiff_plain;f=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fneturl.mli;fp=helm%2FDEVEL%2Fpxp%2Fnetstring%2Fneturl.mli;h=988aef6c89c275508eede6531f577e6ecb81537a;hb=c03d2c1fdab8d228cb88aaba5ca0f556318bebc5;hp=0000000000000000000000000000000000000000;hpb=758057e85325f94cd88583feb1fdf6b038e35055;p=helm.git diff --git a/helm/DEVEL/pxp/netstring/neturl.mli b/helm/DEVEL/pxp/netstring/neturl.mli new file mode 100644 index 000000000..988aef6c8 --- /dev/null +++ b/helm/DEVEL/pxp/netstring/neturl.mli @@ -0,0 +1,460 @@ +(* $Id$ + * ---------------------------------------------------------------------- + * + *) + +(* This module applies already O'Caml-3 features. *) + +(* Uniform Resource Locators (URLs): + * + * This module provides functions to parse URLs, to print URLs, to + * store URLs, to modify URLs, and to apply relative URLs. + * + * URLs are strings formed according to pattern (1) or (2): + * + * (1) scheme://user:password@host:port/path;params?query#fragment + * (2) scheme:other;params?query#fragment + * + * The word at the beginning of the URL identifies the URL scheme + * (such as "http" or "file"). Depending on the scheme, not all of the + * parts are allowed, or parts may be omitted. This module defines the + * type 'url_syntax' whose values describe which parts are allowed/required/ + * not allowed for a concrete URL scheme (see below). + * + * Not all characters are allowed in a URL. Some characters are allowed, + * but have the special task to separate the various parts of the URL + * (reserved characters). + * However, it is possible to include even invalid or reserved characters + * as normal content by applying the '%'-encoding on these characters: + * A '%' indicates that an encoded character follows, and the character + * is denoted by a two-digit hexadecimal number (e.g. %2f for '/'). 
+ * In the following descriptions, the term "encoded string" means a string + * containing such %-encoded characters, and the term "decoded string" means a + * string not containing such characters. + * See the module Netencoding.Url for functions encoding or decoding + * strings. + * + * The type 'url' describes values storing the components of a URL, + * and the 'url_syntax' for the URL. In general, the components are + * stored as encoded strings; however, the '%'-encoding is not + * applicable to all components. + * For convenience, the functions creating, modifying, and accessing + * URLs can handle both encoded and decoded strings. In order to + * avoid errors, the functions pass strings in their decoded form by default. + * + * Note that there is currently no function to compare URLs. The + * canonical comparison ( = ) is not applicable because the same URL + * may be written differently. + * + * Note that nothing is said about the character set/encoding of URLs. + * Some protocols and standards prefer UTF-8 as fundamental encoding + * and apply the '%'-encoding on top of it; i.e. the byte sequence + * representing a character in UTF-8 is '%'-encoded. There is no special + * support for this technique. + * + * For more information about URLs, see RFCs 1738 and 1808. + *) + +exception Malformed_URL +(* Is raised by a number of functions when encountering a badly formed + * URL. + *) + +val extract_url_scheme : string -> string + (* Returns the URL scheme from the string representation of a URL. + * E.g. extract_url_scheme "http://host/path" = "http". + * The scheme name is always converted to lowercase characters. + * Raises Malformed_URL if the scheme name is not found. 
+ *) + +type url_syntax_option = + Url_part_not_recognized + | Url_part_allowed + | Url_part_required + + +type url_syntax = + { url_enable_scheme : url_syntax_option; + url_enable_user : url_syntax_option; + url_enable_password : url_syntax_option; + url_enable_host : url_syntax_option; + url_enable_port : url_syntax_option; + url_enable_path : url_syntax_option; + url_enable_param : url_syntax_option; + url_enable_query : url_syntax_option; + url_enable_fragment : url_syntax_option; + url_enable_other : url_syntax_option; + url_accepts_8bits : bool; + url_is_valid : url -> bool; + } + +and url +;; + +(* Values of type 'url_syntax' describe which components of a URL are + * recognized, which are allowed (and optional), and which are required. + * Not all combinations are valid; the predicate expressed by the + * function 'url_syntax_is_valid' must hold. + * The function 'url_is_valid' is applied when a fresh URL is created + * and must return 'true'. This function makes it possible to add an arbitrary + * validity criterion to 'url_syntax'. (Note that the URL passed to + * this function is not fully working; however, you can safely assume that the + * accessor functions url_scheme etc. can be applied to it.) + * + * Switch 'url_accepts_8bits': If 'true', the bytes with code 128 to + * 255 are treated like alphanumeric characters; if 'false' these bytes + * are illegal (but it is still possible to include such bytes in their + * encoded form: %80 to %FF). + * + * Values of type 'url' describe concrete URLs. Every URL must have + * a fundamental 'url_syntax', and it is only possible to create URLs + * conforming to the syntax. See 'make_url' for further information. + *) + + +val url_syntax_is_valid : url_syntax -> bool + (* Checks whether the passed url_syntax is valid. 
This means: + * + * - If passwords are recognized, users (and hosts) must be recognized, too + * - If ports are recognized, hosts must be recognized, too + * - If users are recognized, hosts must be recognized, too + * - Either the syntax recognizes one of the phrases + * { user, password, host, port, path }, or the syntax recognizes + * the phrase 'other'. + *) + + +val partial_url_syntax : url_syntax -> url_syntax + (* Transforms the syntax into another syntax where all required parts are + * changed into optional parts. + *) + + +(* Note that none of the following url_syntaxes allows 8bit bytes. *) + +val null_url_syntax : url_syntax + +val ip_url_syntax : url_syntax + (* Maximum syntax for IP based protocols *) + +val common_url_syntax : (string, url_syntax) Hashtbl.t + (* Syntax descriptions for common URL schemes: + * + * null_url_syntax: nothing is recognized + * + * common_url_syntax: Hashtable mapping from URL scheme names to + * definitions of syntaxes: + * + * "file": scheme, host?, path + * "ftp": scheme, user?, password?, host, port?, path?, param? + * "http": scheme, user?, password?, host, port?, path?, query? + * "mailto": scheme, other + * + * Notes: + * (1) These syntax descriptions can be weakened for partial/relative URLs + * by changing the required parts to optional parts: See the function + * 'partial_url_syntax'. + * (2) None of the descriptions allows fragments. These can be enabled by + * setting 'url_enable_fragment' to Url_part_allowed. E.g. 
+ * { (Hashtbl.find common_url_syntax "file") with url_enable_fragment = Url_part_allowed } + *) + +val null_url : url + (* A URL without any component and 'null_url_syntax' + *) + +val make_url : + ?encoded:bool -> + ?scheme:string -> + ?user:string -> + ?password:string -> + ?host:string -> + ?port:int -> + ?path:string list -> + ?param:string list -> + ?query:string -> + ?fragment:string -> + ?other:string -> + url_syntax -> + url + (* Creates a URL from components: + * + * - The components "scheme" and "host" are simple strings to which the + * '%'-encoding is not applicable. + * - The component "port" is a simple number. Of course, the '%'-encoding + * is not applicable, either. + * - The components "user", "password", "query", "fragment", and "other" + * are strings which may contain '%'-encoded characters. By default, + * you can pass any string for these components, and problematic characters + * are automatically encoded. If you set ~encoded:true, the passed + * strings must already be encoded, but the function checks whether + * the encoding is correct. + * Note that for "query" even the characters '?' and '=' are encoded + * by default, so you need to set ~encoded:true to pass a reasonable + * query string. + * - The components "path" and "param" are lists of strings which may + * contain '%'-encoded characters. Again, the default is to pass + * decoded strings to the function, and the function encodes them + * automatically, and by setting ~encoded:true the caller is responsible + * for encoding the strings. + * path = [] and param = [] mean that no path and no parameters are + * specified, respectively. + * See below for the representation of these components. + * + * Except for "path", the strings representing the components do not + * contain the characters separating the components from each other. + * The "path" component includes the '/' at the beginning of the path + * (if present). + * + * The created URL must conform to the 'url_syntax', i.e. 
+ * - The URL must only contain components which are recognized by the + * syntax + * - The URL must contain all components which are required by the syntax + * - The URL must fulfill the predicate expressed by the 'url_is_valid' + * function of the syntax. + * + * The path of a URL is represented as a list of '/'-separated path + * components, i.e. + * [ s1; s2; ...; sN ] represents the path + * s1 ^ "/" ^ s2 ^ "/" ^ ... ^ "/" ^ sN + * As special cases: + * [] is the non-existing path + * [ "" ] is "/" + * [ "";"" ] is illegal + * + * Except for s1 and sN, the path components must not be empty strings. + * + * To avoid ambiguities, it is illegal to create URLs with both relative + * paths (s1 <> "") and host components. + * + * Parameters of URLs are components beginning with ';'. The list + * of parameters is represented as a list of strings where the strings + * contain the value following ';'. + *) + +val modify_url : + ?syntax:url_syntax -> + ?encoded:bool -> + ?scheme:string -> + ?user:string -> + ?password:string -> + ?host:string -> + ?port:int -> + ?path:string list -> + ?param:string list -> + ?query:string -> + ?fragment:string -> + ?other:string -> + url -> + url + (* Modifies the passed components and returns the modified URL. + * The modified URL shares unmodified components with the original + * URL. + *) + +val remove_from_url : + ?scheme:bool -> + ?user:bool -> + ?password:bool -> + ?host:bool -> + ?port:bool -> + ?path:bool -> + ?param:bool -> + ?query:bool -> + ?fragment:bool -> + ?other:bool -> + url -> + url + (* Removes the components passed as 'true' from the URL, and returns the modified + * URL. + * The modified URL shares unmodified components with the original + * URL. 
+ *) + +val default_url : + ?encoded:bool -> + ?scheme:string -> + ?user:string -> + ?password:string -> + ?host:string -> + ?port:int -> + ?path:string list -> + ?param:string list -> + ?query:string -> + ?fragment:string -> + ?other:string -> + url -> + url + (* Adds missing components and returns the modified URL. + * The modified URL shares unmodified components with the original + * URL. + *) + +val undefault_url : + ?scheme:string -> + ?user:string -> + ?password:string -> + ?host:string -> + ?port:int -> + ?path:string list -> + ?param:string list -> + ?query:string -> + ?fragment:string -> + ?other:string -> + url -> + url + (* Removes components from the URL if they have the passed value, and + * returns the modified URL. + * Note: The values must always be passed in _encoded_ form! + * The modified URL shares unmodified components with the original + * URL. + *) + +val url_syntax_of_url : url -> url_syntax + (* Returns the 'url_syntax' record of a URL. *) + +val url_of_string : url_syntax -> string -> url + (* Parses the passed string according to the passed url_syntax. *) + +val string_of_url : url -> string + (* Returns the URL as a string *) + +val url_provides : + ?scheme:bool -> + ?user:bool -> + ?password:bool -> + ?host:bool -> + ?port:bool -> + ?path:bool -> + ?param:bool -> + ?query:bool -> + ?fragment:bool -> + ?other:bool -> + url -> + bool + (* Returns 'true' iff the URL has all of the components passed with + * the value 'true'. + *) + +val url_scheme : url -> string +val url_user : ?encoded:bool -> url -> string +val url_password : ?encoded:bool -> url -> string +val url_host : url -> string +val url_port : url -> int +val url_path : ?encoded:bool -> url -> string list +val url_param : ?encoded:bool -> url -> string list +val url_query : ?encoded:bool -> url -> string +val url_fragment : ?encoded:bool -> url -> string +val url_other : ?encoded:bool -> url -> string + (* Return components of the URL. 
The functions return decoded strings + * unless ~encoded:true is set. + * If the component does not exist, the exception Not_found + * is raised. + *) + +val split_path : string -> string list + (* Splits a '/'-separated path into components (e.g. to set up the + * ~path argument of make_url). + * E.g. split_path "a/b/c" = [ "a"; "b"; "c" ], + * split_path "/a/b" = [ ""; "a"; "b" ], + * split_path "a/b/" = [ "a"; "b"; "" ] + *) + +val join_path : string list -> string + (* Concatenates the path components (reverse function of split_path). + *) + +val norm_path : string list -> string list + (* Removes "." and ".." from the path if possible. Deletes double slashes. + * + * EXAMPLES: + * + * norm_path ["."] = [] + * means: "." = "" + * norm_path ["."; ""] = [] + * means: "./" = "" + * norm_path ["a"; "."] = ["a"; ""] + * means: "a/." = "a/" + * norm_path ["a"; "b"; "."] = ["a"; "b"; ""] + * means: "a/b/." = "a/b/" + * norm_path ["a"; "."; "b"; "."] = ["a"; "b"; ""] + * means: "a/./b/." = "a/b/" + * norm_path [".."] = [".."; ""] + * means: ".." = "../" + * norm_path [".."; ""] = [".."; ""] + * means: "../" = "../" + * norm_path ["a"; "b"; ".."; "c" ] = ["a"; "c"] + * means: "a/b/../c" = "a/c" + * norm_path ["a"; "b"; ".."; "c"; ""] = ["a"; "c"; ""] + * means: "a/b/../c/" = "a/c/" + * norm_path ["";"";"a";"";"b"] = [""; "a"; "b"] + * means: "//a//b" = "/a/b" + * norm_path ["a"; "b"; ""; ".."; "c"; ""] = ["a"; "c"; ""] + * means: "a/b//../c/" = "a/c/" + * norm_path ["a"; ".."] = [] + * means: "a/.." = "" + *) + + +val apply_relative_url : url -> url -> url + (* apply_relative_url base rel: + * Interprets 'rel' relative to 'base' and returns the new URL. This + * function implements RFC 1808. + *) + +val print_url : url -> unit + (* Printer for the toploop. 
*) + +(* ---------------------------------------------------------------------- *) + +(* EXAMPLES: + * + * let http = Hashtbl.find common_url_syntax "http";; + * let u = url_of_string http "http://g:pw@host/a/%62/";; + * string_of_url u;; + * --> "http://g:pw@host/a/%62/" + * url_scheme u;; + * --> "http" + * url_user u;; + * --> "g" + * url_password u;; + * --> "pw" + * url_host u;; + * --> "host" + * url_path u;; + * --> [ ""; "a"; "b"; "" ] (* sic! *) + * url_path ~encoded:true u;; + * --> [ ""; "a"; "%62"; "" ] + * let v = make_url + * ~path:[ ".."; "c" ] + * ~fragment:"near-the-#-character" + * { (partial_url_syntax http) with url_enable_fragment = Url_part_allowed };; + * string_of_url v;; + * --> "../c#near-the-%23-character" + * let u' = modify_url ~syntax:(url_syntax_of_url v) u;; + * (* u does not permit fragments *) + * let w = apply_relative_url u' v;; + * string_of_url w;; + * --> "http://g:pw@host/c#near-the-%23-character" + *) + +(* ====================================================================== + * History: + * + * $Log$ + * Revision 1.1 2000/11/17 09:57:27 lpadovan + * Initial revision + * + * Revision 1.3 2000/06/26 22:57:49 gerd + * Change: The record 'url_syntax' has an additional component + * 'url_accepts_8bits'. Setting this option to 'true' causes that + * the bytes >= 0x80 are no longer rejected. + * + * Revision 1.2 2000/06/25 22:55:47 gerd + * Doc update. + * + * Revision 1.1 2000/06/24 20:19:59 gerd + * Initial revision. + * + * + *)