(* $Id$
 * ----------------------------------------------------------------------
 *)
(* The type 'document' represents parsed HTML documents.
 * Element (name, args, subnodes): is an element node for an element of
 *   type 'name' (i.e. written <name ...>...</name>) with arguments 'args'
 *   and subnodes 'subnodes' (the material within the element). The arguments
 *   are simply name/value pairs. Entity references (something like &xy;)
 *   occurring in the values are NOT resolved.
 *   Arguments without values (e.g. <select multiple>: here,
 *   "multiple" is such an argument) are represented as (name,name), i.e. the
 *   name is returned as value.
 *   As argument names are case-insensitive, the names are all lowercase.
 * Data s: is a character data node. Again, entity references are contained
 *   as such and not as what they mean.
 *)
type document =
Element of (string * (string*string) list * document list)
  (* Element (name, args, subnodes): an element node. 'name' is the element
   * type, 'args' the list of argument name/value pairs, and 'subnodes' the
   * nodes contained in the element.
   * Note: the payload is written as a parenthesized tuple type, so the
   * constructor carries a single triple.
   *)
| Data of string
  (* Data s: a character data node holding the text 's'. *)
val no_end_tag : string list ref;;
  (* List of tags which are always empty. This variable is pre-configured,
   * but you may want to change it.
   * It is important to know which elements are always empty, because HTML
   * allows the end tag to be omitted for them. For example,
   * <a><b>x</a> is parsed as
   *   Element("a",[],[ Element("b",[],[]); Data "x" ])
   * if we know that "b" is an empty element, but it is wrongly parsed as
   *   Element("a",[],[ Element("b",[], [ Data "x"]) ])
   * if "b" is actually empty but we do not know it.
   * An example of such a tag is "br".
   *)
val special_tag : string list ref;;
  (* List of tags with a special rule for recognizing the end.
   * This variable is pre-configured, but you may want to change it.
   * The special rule is that the metacharacters '<', '>' and so on lose
   * their meaning within the element, and that only the corresponding
   * end tag stops this kind of scanning. An example is the element
   * "javascript". Inner elements are not recognized, and the element
   * can only be ended by </javascript>. (Other elements are also ended
   * if an embracing element ends, e.g. "j" in <i><j></i>!)
   * Note that comments are not recognized within special elements;
   * comments are returned as character material.
   *)
val parse_string : string -> document list
  (* Parses the HTML document contained in the given string and returns it
   * as a list of 'document' nodes.
   *)
val parse_file : in_channel -> document list
  (* Parses the HTML document from a file and returns it as a list of
   * 'document' nodes. Note that the file is passed as an input channel
   * (in_channel), not as a file name.
   * NOTE(review): whether the channel is closed after parsing cannot be
   * told from this interface -- confirm against the implementation.
   *)
(* ======================================================================
* History:
* $Log$
* Revision 1.1 2000/11/17 09:57:28 lpadovan
* Initial revision
* Revision 1.1 2000/03/03 01:07:25 gerd
* Initial revision.