(* $Id$
 * ----------------------------------------------------------------------
 *
 *)

(* This module already uses O'Caml-3 features. *)

(* Uniform Resource Locators (URLs):
 *
 * This module provides functions to parse URLs, to print URLs, to
 * store URLs, to modify URLs, and to apply relative URLs.
 *
 * URLs are strings formed according to pattern (1) or (2):
 *
 * (1) scheme://user:password@host:port/path;params?query#fragment
 * (2) scheme:other;params?query#fragment
 *
 * The word at the beginning of the URL identifies the URL scheme
 * (such as "http" or "file"). Depending on the scheme, not all of the
 * parts are allowed, or parts may be omitted. This module defines the
 * type 'url_syntax' whose values describe which parts are allowed/required/
 * not allowed for a concrete URL scheme (see below).
 *
 * Not all characters are allowed in a URL. Some characters are allowed,
 * but have the special task to separate the various parts of the URL
 * (reserved characters).
 * However, it is possible to include even invalid or reserved characters
 * as normal content by applying the '%'-encoding on these characters:
 * A '%' indicates that an encoded character follows, and the character
 * is denoted by a two-digit hexadecimal number (e.g. %2f for '/').
 * In the following descriptions, the term "encoded string" means a string
 * containing such %-encoded characters, and the "decoded string" means a
 * string not containing such characters.
 * See the module Netencoding.Url for functions encoding or decoding
 * strings.
 *
 * The type 'url' describes values storing the components of a URL,
 * and the 'url_syntax' for the URL. In general, the components are
 * stored as encoded strings; however, the '%'-encoding is not
 * applicable to all components.
 * For convenience, the functions creating, modifying, and accessing
 * URLs can handle both encoded and decoded strings. In order to
 * avoid errors, these functions exchange strings in their decoded form
 * by default (see the ~encoded arguments below).
 *
 * Note that there is currently no function to compare URLs. The
 * canonical comparison ( = ) is not applicable because the same URL
 * may be written differently.
 *
 * Note that nothing is said about the character set/encoding of URLs.
 * Some protocols and standards prefer UTF-8 as fundamental encoding
 * and apply the '%'-encoding on top of it; i.e. the byte sequence
 * representing a character in UTF-8 is '%'-encoded. There is no special
 * support for this technique.
 *
 * For more information about URLs, see RFCs 1738 and 1808.
 *)

exception Malformed_URL
(* Is raised by a number of functions when encountering a badly formed
 * URL.
 *)

val extract_url_scheme : string -> string
  (* Returns the URL scheme from the string representation of a URL.
   * E.g. extract_url_scheme "http://host/path" = "http".
   * The scheme name is always converted to lowercase characters.
   * Raises Malformed_URL if the scheme name is not found.
   *)

type url_syntax_option =
    Url_part_not_recognized
  | Url_part_allowed
  | Url_part_required


type url_syntax =
    { url_enable_scheme    : url_syntax_option;
      url_enable_user      : url_syntax_option;
      url_enable_password  : url_syntax_option;
      url_enable_host      : url_syntax_option;
      url_enable_port      : url_syntax_option;
      url_enable_path      : url_syntax_option;
      url_enable_param     : url_syntax_option;
      url_enable_query     : url_syntax_option;
      url_enable_fragment  : url_syntax_option;
      url_enable_other     : url_syntax_option;
      url_accepts_8bits    : bool;
      url_is_valid         : url -> bool;
    }

and url
;;

(* Values of type 'url_syntax' describe which components of a URL are
 * recognized, which are allowed (and optional), and which are required.
 * Not all combinations are valid; the predicate expressed by the
 * function 'url_syntax_is_valid' must hold.
 * The function 'url_is_valid' is applied when a fresh URL is created
 * and must return 'true'. This function makes it possible to add an
 * arbitrary validity criterion to 'url_syntax'. (Note that the URL passed
 * to this function is not fully working; you can safely assume that the
 * accessor functions url_scheme etc. can be applied to it.)
 *
 * Switch 'url_accepts_8bits': If 'true', the bytes with code 128 to
 * 255 are treated like alphanumeric characters; if 'false' these bytes
 * are illegal (but it is still possible to include such bytes in their
 * encoded form: %80 to %FF).
 *
 * Values of type 'url' describe concrete URLs. Every URL must have
 * a fundamental 'url_syntax', and it is only possible to create URLs
 * conforming to the syntax. See 'make_url' for further information.
 *)


val url_syntax_is_valid : url_syntax -> bool
  (* Checks whether the passed url_syntax is valid. This means:
   *
   * - If passwords are recognized, users (and hosts) must be recognized, too
   * - If ports are recognized, hosts must be recognized, too
   * - If users are recognized, hosts must be recognized, too
   * - Either the syntax recognizes one of the phrases
   *   { user, password, host, port, path }, or the syntax recognizes
   *   the phrase 'other'.
   *)


val partial_url_syntax : url_syntax -> url_syntax
  (* Transforms the syntax into another syntax where all required parts are
   * changed into optional parts.
   *)


(* Note that all following url_syntaxes do not allow 8bit bytes. *)

val null_url_syntax   : url_syntax

val ip_url_syntax : url_syntax
  (* Maximum syntax for IP based protocols *)

val common_url_syntax : (string, url_syntax) Hashtbl.t
  (* Syntax descriptions for common URL schemes:
   *
   * null_url_syntax: nothing is recognized
   *
   * common_url_syntax: Hashtable mapping from URL scheme names to
   * definitions of syntaxes:
   *
   * "file":   scheme, host?, path
   * "ftp":    scheme, user?, password?, host, port?, path?, param?
   * "http":   scheme, user?, password?, host, port?, path?, query?
   * "mailto": scheme, other
   *
   * Notes:
   * (1) These syntax descriptions can be weakened for partial/relative URLs
   *     by changing the required parts to optional parts: See the function
   *     'partial_url_syntax'.
   * (2) None of the descriptions allows fragments. These can be enabled by
   *     setting 'url_enable_fragment' to Url_part_allowed. E.g.
   *     { (Hashtbl.find common_url_syntax "file")
   *         with url_enable_fragment = Url_part_allowed }
   *)

val null_url : url
  (* A URL without any component and 'null_url_syntax'
   *)

val make_url :
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url_syntax ->
        url
  (* Creates a URL from components:
   *
   * - The components "scheme" and "host" are simple strings to which the
   *   '%'-encoding is not applicable.
   * - The component "port" is a simple number. Of course, the '%'-encoding
   *   is not applicable, either.
   * - The components "user", "password", "query", "fragment", and "other"
   *   are strings which may contain '%'-encoded characters. By default,
   *   you can pass any string for these components, and problematic
   *   characters are automatically encoded. If you set ~encoded:true, the
   *   passed strings must already be encoded, but the function checks
   *   whether the encoding is correct.
   *   Note that for "query" even the characters '?' and '=' are encoded
   *   by default, so you need to set ~encoded:true to pass a reasonable
   *   query string.
   * - The components "path" and "param" are lists of strings which may
   *   contain '%'-encoded characters. Again, the default is to pass
   *   decoded strings to the function, and the function encodes them
   *   automatically, and by setting ~encoded:true the caller is responsible
   *   for encoding the strings.
   *   path = [] and param = [] mean that no path and no parameters are
   *   specified, respectively.
   *   See below for the representation of these components.
   *
   * Except for "path", the strings representing the components do not
   * contain the characters separating the components from each other.
   * The "path" component includes the '/' at the beginning of the path
   * (if present).
   *
   * The created URL must conform to the 'url_syntax', i.e.
   * - The URL must only contain components which are recognized by the
   *   syntax
   * - The URL must contain components which are required by the syntax
   * - The URL must fulfill the predicate expressed by the 'url_is_valid'
   *   function of the syntax.
   * NOTE(review): presumably Malformed_URL is raised when these conditions
   * are violated; this interface does not say so explicitly -- confirm
   * against the implementation.
   *
   * The path of a URL is represented as a list of '/'-separated path
   * components. i.e.
   *   [ s1; s2; ...; sN ]  represents the path
   *                        s1 ^ "/" ^ s2 ^ "/" ^ ... ^ "/" ^ sN
   * As special cases:
   *   []                   is the non-existing path
   *   [ "" ]               is "/"
   *   [ "";"" ]            is illegal
   *
   * Except for s1 and sN, the path components must not be empty strings.
   *
   * To avoid ambiguities, it is illegal to create URLs with both relative
   * paths (s1 <> "") and host components.
   *
   * Parameters of URLs are components beginning with ';'. The list
   * of parameters is represented as list of strings where the strings
   * contain the value following ';'.
   *)

val modify_url :
      ?syntax:url_syntax ->
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
        url
  (* Modifies the passed components and returns the modified URL.
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val remove_from_url :
      ?scheme:bool ->
      ?user:bool ->
      ?password:bool ->
      ?host:bool ->
      ?port:bool ->
      ?path:bool ->
      ?param:bool ->
      ?query:bool ->
      ?fragment:bool ->
      ?other:bool ->
      url ->
        url
  (* Removes the 'true' components from the URL, and returns the modified
   * URL.
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val default_url :
      ?encoded:bool ->
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
        url
  (* Adds missing components and returns the modified URL.
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val undefault_url :
      ?scheme:string ->
      ?user:string ->
      ?password:string ->
      ?host:string ->
      ?port:int ->
      ?path:string list ->
      ?param:string list ->
      ?query:string ->
      ?fragment:string ->
      ?other:string ->
      url ->
        url
  (* Removes components from the URL if they have the passed value, and
   * returns the modified URL.
   * Note: The values must always be passed in _encoded_ form! (Unlike
   * the functions above, there is no ~encoded argument here.)
   * The modified URL shares unmodified components with the original
   * URL.
   *)

val url_syntax_of_url : url -> url_syntax
  (* Returns the 'url_syntax' record of a URL. *)

val url_of_string : url_syntax -> string -> url
  (* Parses the passed string according to the passed url_syntax.
   * NOTE(review): presumably raises Malformed_URL for strings that do not
   * conform to the syntax; not stated in this interface -- confirm.
   *)

val string_of_url : url -> string
  (* Returns the URL as string *)

val url_provides :
      ?scheme:bool ->
      ?user:bool ->
      ?password:bool ->
      ?host:bool ->
      ?port:bool ->
      ?path:bool ->
      ?param:bool ->
      ?query:bool ->
      ?fragment:bool ->
      ?other:bool ->
      url ->
        bool
  (* Returns 'true' iff the URL has all of the components passed with
   * 'true' value.
   *)

val url_scheme    :                  url -> string
val url_user      : ?encoded:bool -> url -> string
val url_password  : ?encoded:bool -> url -> string
val url_host      :                  url -> string
val url_port      :                  url -> int
val url_path      : ?encoded:bool -> url -> string list
val url_param     : ?encoded:bool -> url -> string list
val url_query     : ?encoded:bool -> url -> string
val url_fragment  : ?encoded:bool -> url -> string
val url_other     : ?encoded:bool -> url -> string
  (* Return components of the URL. The functions return decoded strings
   * unless ~encoded:true is set.
   * If the component does not exist, the exception Not_found
   * is raised.
   *)

val split_path : string -> string list
  (* Splits a '/'-separated path into components (e.g. to set up the
   * ~path argument of make_url).
   * E.g. split_path "a/b/c" = [ "a"; "b"; "c" ],
   *      split_path "/a/b"  = [ ""; "a"; "b" ],
   *      split_path "a/b/"  = [ "a"; "b"; "" ]
   *)

val join_path : string list -> string
  (* Concatenates the path components (reverse function of split_path).
   *)

val norm_path : string list -> string list
  (* Removes "." and ".." from the path if possible. Deletes double slashes.
   *
   * EXAMPLES:
   *
   * norm_path ["."] = []
   *           means: "." = ""
   * norm_path ["."; ""] = []
   *           means: "./" = ""
   * norm_path ["a"; "."] = ["a"; ""]
   *           means: "a/." = "a/"
   * norm_path ["a"; "b"; "."] = ["a"; "b"; ""]
   *           means: "a/b/." = "a/b/"
   * norm_path ["a"; "."; "b"; "."] = ["a"; "b"; ""]
   *           means: "a/./b/." = "a/b/"
   * norm_path [".."] = [".."; ""]
   *           means: ".." = "../"
   * norm_path [".."; ""] = [".."; ""]
   *           means: "../" = "../"
   * norm_path ["a"; "b"; ".."; "c" ] = ["a"; "c"]
   *           means: "a/b/../c" = "a/c"
   * norm_path ["a"; "b"; ".."; "c"; ""] = ["a"; "c"; ""]
   *           means: "a/b/../c/" = "a/c/"
   * norm_path ["";"";"a";"";"b"] = [""; "a"; "b"]
   *           means: "//a//b" = "/a/b"
   * norm_path ["a"; "b"; ""; ".."; "c"; ""] = ["a"; "c"; ""]
   *           means: "a/b//../c/" = "a/c/"
   * norm_path ["a"; ".."] = []
   *           means: "a/.." = ""
   *)


val apply_relative_url : url -> url -> url
  (* apply_relative_url base rel:
   * Interprets 'rel' relative to 'base' and returns the new URL. This
   * function implements RFC 1808.
   *)

val print_url : url -> unit
  (* Printer for the toploop. *)

(* ---------------------------------------------------------------------- *)

(* EXAMPLES:
 *
 * let http = Hashtbl.find common_url_syntax "http";;
 * let u = url_of_string http "http://g:pw@host/a/%62/";;
 * string_of_url u;;
 *   --> "http://g:pw@host/a/%62/"
 * url_scheme u;;
 *   --> "http"
 * url_user u;;
 *   --> "g"
 * url_password u;;
 *   --> "pw"
 * url_host u;;
 *   --> "host"
 * url_path u;;
 *   --> [ ""; "a"; "b"; "" ]          (* sic! *)
 * url_path ~encoded:true u;;
 *   --> [ ""; "a"; "%62"; "" ]
 * let v = make_url
 *   ~path:[ ".."; "c" ]
 *   ~fragment:"near-the-#-character"
 *   { (partial_url_syntax http) with url_enable_fragment = Url_part_allowed };;
 * string_of_url v;;
 *   --> "../c#near-the-%23-character"
 * let u' = modify_url ~syntax:(url_syntax_of_url v) u;;
 *    (* u does not permit fragments *)
 * let w = apply_relative_url u' v;;
 * string_of_url w;;
 *   --> "http://g:pw@host/c#near-the-%23-character"
 *)

(* ======================================================================
 * History:
 *
 * $Log$
 * Revision 1.1  2000/11/17 09:57:27  lpadovan
 * Initial revision
 *
 * Revision 1.3  2000/06/26 22:57:49  gerd
 * 	Change: The record 'url_syntax' has an additional component
 * 'url_accepts_8bits'. Setting this option to 'true' causes that
 * the bytes >= 0x80 are no longer rejected.
 *
 * Revision 1.2  2000/06/25 22:55:47  gerd
 * 	Doc update.
 *
 * Revision 1.1  2000/06/24 20:19:59  gerd
 * 	Initial revision.
 *
 *
 *)