2 * ----------------------------------------------------------------------
6 (* This module already uses O'Caml-3 features. *)
8 (* Uniform Resource Locators (URLs):
10 * This module provides functions to parse URLs, to print URLs, to
11 * store URLs, to modify URLs, and to apply relative URLs.
13 * URLs are strings formed according to pattern (1) or (2):
15 * (1) scheme://user:password@host:port/path;params?query#fragment
16 * (2) scheme:other;params?query#fragment
18 * The word at the beginning of the URL identifies the URL scheme
19 * (such as "http" or "file"). Depending on the scheme, not all of the
20 * parts are allowed, or parts may be omitted. This module defines the
21 * type 'url_syntax' whose values describe which parts are allowed/required/
22 * not allowed for a concrete URL scheme (see below).
24 * Not all characters are allowed in a URL. Some characters are allowed,
25 * but have the special task to separate the various parts of the URL
26 * (reserved characters).
27 * However, it is possible to include even invalid or reserved characters
28 * as normal content by applying the '%'-encoding on these characters:
29 * A '%' indicates that an encoded character follows, and the character
30 * is denoted by a two-digit hexadecimal number (e.g. %2f for '/').
31 * In the following descriptions, the term "encoded string" means a string
32 * containing such %-encoded characters, and the "decoded string" means a
33 * string not containing such characters.
34 * See the module Netencoding.Url for functions encoding or decoding
37 * The type 'url' describes values storing the components of a URL,
38 * and the 'url_syntax' for the URL. In general, the components are
39 * stored as encoded strings; however, the '%'-encoding is not
40 * applicable to all components.
41 * For convenience, the functions creating, modifying, and accessing
42 * URLs can handle both encoded and decoded strings. In order to
43 * avoid errors, the functions pass strings even in their decoded form.
45 * Note that there is currently no function to compare URLs. The
46 * canonical comparison ( = ) is not applicable because the same URL
47 * may be written differently.
49 * Note that nothing is said about the character set/encoding of URLs.
50 * Some protocols and standards prefer UTF-8 as fundamental encoding
51 * and apply the '%'-encoding on top of it; i.e. the byte sequence
52 * representing a character in UTF-8 is '%'-encoded. There is no special
53 * support for this technique.
55 * For more information about URLs, see RFCs 1738 and 1808.
58 exception Malformed_URL
59 (* Is raised by a number of functions when encountering a badly formed
63 val extract_url_scheme : string -> string
64 (* Returns the URL scheme from the string representation of a URL.
65 * E.g. extract_url_scheme "http://host/path" = "http".
66 * The scheme name is always converted to lowercase characters.
67 * Raises Malformed_URL if the scheme name is not found.
70 type url_syntax_option =
71 Url_part_not_recognized
77 { url_enable_scheme : url_syntax_option;
78 url_enable_user : url_syntax_option;
79 url_enable_password : url_syntax_option;
80 url_enable_host : url_syntax_option;
81 url_enable_port : url_syntax_option;
82 url_enable_path : url_syntax_option;
83 url_enable_param : url_syntax_option;
84 url_enable_query : url_syntax_option;
85 url_enable_fragment : url_syntax_option;
86 url_enable_other : url_syntax_option;
87 url_accepts_8bits : bool;
88 url_is_valid : url -> bool;
94 (* Values of type 'url_syntax' describe which components of a URL are
95 * recognized, which are allowed (and optional), and which are required.
96 * Not all combinations are valid; the predicate expressed by the
97 * function 'url_syntax_is_valid' must hold.
98 * The function 'url_is_valid' is applied when a fresh URL is created
99 * and must return 'true'. This function makes it possible to add an arbitrary
100 * validity criterion to 'url_syntax'. (Note that the URL passed to
101 * this function is not fully working; however, you can safely assume that the
102 * accessor functions url_scheme etc. can be applied to it.)
104 * Switch 'url_accepts_8bits': If 'true', the bytes with code 128 to
105 * 255 are treated like alphanumeric characters; if 'false', these bytes
106 * are illegal (but it is still possible to include such bytes in their
107 * encoded form: %80 to %FF).
109 * Values of type 'url' describe concrete URLs. Every URL must have
110 * a fundamental 'url_syntax', and it is only possible to create URLs
111 * conforming to the syntax. See 'make_url' for further information.
115 val url_syntax_is_valid : url_syntax -> bool
116 (* Checks whether the passed url_syntax is valid. This means:
118 * - If passwords are recognized, users (and hosts) must be recognized, too
119 * - If ports are recognized, hosts must be recognized, too
120 * - If users are recognized, hosts must be recognized, too
121 * - Either the syntax recognizes one of the phrases
122 * { user, password, host, port, path }, or the syntax recognizes
123 * the phrase 'other'.
127 val partial_url_syntax : url_syntax -> url_syntax
128 (* Transforms the syntax into another syntax where all required parts are
129 * changed into optional parts.
133 (* Note that all following url_syntaxes do not allow 8bit bytes. *)
135 val null_url_syntax : url_syntax
137 val ip_url_syntax : url_syntax
138 (* Maximum syntax for IP based protocols *)
140 val common_url_syntax : (string, url_syntax) Hashtbl.t
141 (* Syntax descriptions for common URL schemes:
143 * null_url_syntax: nothing is recognized
145 * common_url_syntax: Hashtable mapping from URL scheme names to
146 * definitions of syntaxes:
148 * "file": scheme, host?, path
149 * "ftp": scheme, user?, password?, host, port?, path?, param?
150 * "http": scheme, user?, password?, host, port?, path?, query?
151 * "mailto": scheme, other
154 * (1) These syntax descriptions can be weakened for partial/relative URLs
155 * by changing the required parts to optional parts: See the function
156 * 'partial_url_syntax'.
157 * (2) None of the descriptions allows fragments. These can be enabled by
158 * setting 'url_enable_fragment' to Url_part_allowed. E.g.
159 * { file_url_syntax with url_enable_fragment = Url_part_allowed }
163 (* A URL without any component and 'null_url_syntax'
174 ?param:string list ->
180 (* Creates a URL from components:
182 * - The components "scheme" and "host" are simple strings to which the
183 * '%'-encoding is not applicable.
184 * - The component "port" is a simple number. Of course, the '%'-encoding
185 * is not applicable either.
186 * - The components "user", "password", "query", "fragment", and "other"
187 * are strings which may contain '%'-encoded characters. By default,
188 * you can pass any string for these components, and problematic characters
189 * are automatically encoded. If you set ~encoded:true, the passed
190 * strings must already be encoded, but the function checks whether
191 * the encoding is correct.
192 * Note that for "query" even the characters '?' and '=' are encoded
193 * by default, so you need to set ~encoded:true to pass a reasonable
195 * - The components "path" and "param" are lists of strings which may
196 * contain '%'-encoded characters. Again, the default is to pass
197 * decoded strings to the function, and the function encodes them
198 * automatically, and by setting ~encoded:true the caller is responsible
199 * for encoding the strings.
200 * path = [] and param = [] mean that no path and no parameters are
201 * specified, respectively.
202 * See below for the representation of these components.
204 * Except for "path", the strings representing the components do not
205 * contain the characters separating the components from each other.
206 * The "path" component includes the '/' at the beginning of the path
209 * The created URL must conform to the 'url_syntax', i.e.
210 * - The URL must only contain components which are recognized by the
212 * - The URL must contain components which are required by the syntax
213 * - The URL must fulfill the predicate expressed by the 'url_is_valid'
214 * function of the syntax.
216 * The path of a URL is represented as a list of '/'-separated path
218 * [ s1; s2; ...; sN ] represents the path
219 * s1 ^ "/" ^ s2 ^ "/" ^ ... ^ "/" ^ sN
221 * [] is the non-existing path
223 * [ "";"" ] is illegal
225 * Except for s1 and sN, the path components must not be empty strings.
227 * To avoid ambiguities, it is illegal to create URLs with both relative
228 * paths (s1 <> "") and host components.
230 * Parameters of URLs are components beginning with ';'. The list
231 * of parameters is represented as list of strings where the strings
232 * contain the value following ';'.
236 ?syntax:url_syntax ->
244 ?param:string list ->
250 (* Modifies the passed components and returns the modified URL.
251 * The modified URL shares unmodified components with the original
255 val remove_from_url :
268 (* Removes the 'true' components from the URL, and returns the modified
270 * The modified URL shares unmodified components with the original
282 ?param:string list ->
288 (* Adds missing components and returns the modified URL.
289 * The modified URL shares unmodified components with the original
300 ?param:string list ->
306 (* Removes components from the URL if they have the passed value, and
307 * returns the modified URL.
308 * Note: The values must always be passed in _encoded_ form!
309 * The modified URL shares unmodified components with the original
313 val url_syntax_of_url : url -> url_syntax
314 (* Returns the 'url_syntax' record of a URL. *)
316 val url_of_string : url_syntax -> string -> url
317 (* Parses the passed string according to the passed url_syntax. *)
319 val string_of_url : url -> string
320 (* Returns the URL as string *)
335 (* Returns 'true' iff the URL has all of the components passed with
339 val url_scheme : url -> string
340 val url_user : ?encoded:bool -> url -> string
341 val url_password : ?encoded:bool -> url -> string
342 val url_host : url -> string
343 val url_port : url -> int
344 val url_path : ?encoded:bool -> url -> string list
345 val url_param : ?encoded:bool -> url -> string list
346 val url_query : ?encoded:bool -> url -> string
347 val url_fragment : ?encoded:bool -> url -> string
348 val url_other : ?encoded:bool -> url -> string
349 (* Return components of the URL. The functions return decoded strings
350 * unless ~encoded:true is set.
351 * If the component does not exist, the exception Not_found
355 val split_path : string -> string list
356 (* Splits a '/'-separated path into components (e.g. to set up the
357 * ~path argument of make_url).
358 * E.g. split_path "a/b/c" = [ "a"; "b"; "c" ],
359 * split_path "/a/b" = [ ""; "a"; "b" ],
360 * split_path "a/b/" = [ "a"; "b"; "" ]
363 val join_path : string list -> string
364 (* Concatenates the path components (reverse function of split_path).
367 val norm_path : string list -> string list
368 (* Removes "." and ".." from the path if possible. Deletes double slashes.
372 * norm_path ["."] = []
374 * norm_path ["."; ""] = []
376 * norm_path ["a"; "."] = ["a"; ""]
377 * means: "a/." = "a/"
378 * norm_path ["a"; "b"; "."] = ["a"; "b"; ""]
379 * means: "a/b/." = "a/b/"
380 * norm_path ["a"; "."; "b"; "."] = ["a"; "b"; ""]
381 * means: "a/./b/." = "a/b/"
382 * norm_path [".."] = [".."; ""]
383 * means: ".." = "../"
384 * norm_path [".."; ""] = [".."; ""]
385 * means: "../" = "../"
386 * norm_path ["a"; "b"; ".."; "c" ] = ["a"; "c"]
387 * means: "a/b/../c" = "a/c"
388 * norm_path ["a"; "b"; ".."; "c"; ""] = ["a"; "c"; ""]
389 * means: "a/b/../c/" = "a/c/"
390 * norm_path ["";"";"a";"";"b"] = [""; "a"; "b"]
391 * means: "//a//b" = "/a/b"
392 * norm_path ["a"; "b"; ""; ".."; "c"; ""] = ["a"; "c"; ""]
393 * means: "a/b//../c/" = "a/c/"
394 * norm_path ["a"; ".."] = []
399 val apply_relative_url : url -> url -> url
400 (* apply_relative_url base rel:
401 * Interprets 'rel' relative to 'base' and returns the new URL. This
402 * function implements RFC 1808.
405 val print_url : url -> unit
406 (* Printer for the toploop. *)
408 (* ---------------------------------------------------------------------- *)
412 * let http = Hashtbl.find common_url_syntax "http";;
413 * let u = url_of_string http "http://g:pw@host/a/%62/";;
415 * --> "http://g:pw@host/a/%62/"
425 * --> [ ""; "a"; "b"; "" ] (* sic! *)
426 * url_path ~encoded:true u;;
427 * --> [ ""; "a"; "%62"; "" ]
429 * ~path:[ ".."; "c" ]
430 * ~fragment:"near-the-#-character"
431 * { (partial_url_syntax http) with url_enable_fragment = Url_part_allowed };;
433 * --> "../c#near-the-%23-character"
434 * let u' = modify_url ~syntax:(url_syntax_of_url v) u;;
435 * (* u does not permit fragments *)
436 * let w = apply_relative_url u' v;;
438 * --> "http://g:pw@host/c#near-the-%23-character"
441 (* ======================================================================
445 * Revision 1.1 2000/11/17 09:57:27 lpadovan
448 * Revision 1.3 2000/06/26 22:57:49 gerd
449 * Change: The record 'url_syntax' has an additional component
450 * 'url_accepts_8bits'. Setting this option to 'true' causes that
451 * the bytes >= 0x80 are no longer rejected.
453 * Revision 1.2 2000/06/25 22:55:47 gerd
456 * Revision 1.1 2000/06/24 20:19:59 gerd