--- /dev/null
+(* $Id$
+ * ----------------------------------------------------------------------
+ *
+ *)
+
+open Nethtml_scanner;;
+
+type document =
+ Element of (string * (string*string) list * document list)
+ | Data of string
+;;
+
+
+exception End_of_scan;;
+
+
+let no_end_tag = (* empty HTML elements *)
+ ref
+ [ "isindex";
+ "base";
+ "meta";
+ "link";
+ "hr";
+ "input";
+ "img";
+ "param";
+ "basefont";
+ "br";
+ "area";
+ ]
+;;
+
+
+let special_tag = (* other lexical rules *)
+ ref
+ [ "script";
+ "style";
+ ]
+;;
+
+
+let rec parse_comment buf =
+ let t = scan_comment buf in
+ match t with
+ Mcomment ->
+ parse_comment buf
+ | Eof ->
+ raise End_of_scan
+ | _ ->
+ ()
+;;
+
+
+let rec parse_doctype buf =
+ let t = scan_doctype buf in
+ match t with
+ Mdoctype ->
+ parse_doctype buf
+ | Eof ->
+ raise End_of_scan
+ | _ ->
+ ()
+;;
+
+
+let parse_document buf =
+ let current_name = ref "" in
+ let current_atts = ref [] in
+ let current_subs = ref [] in
+ let stack = Stack.create() in
+
+ let parse_atts() =
+ let rec next_no_space() =
+ match scan_element buf with
+ Space _ -> next_no_space()
+ | t -> t
+ in
+
+ let rec parse_atts_lookahead next =
+ match next with
+ Relement -> []
+ | Name n ->
+ begin match next_no_space() with
+ Is ->
+ begin match next_no_space() with
+ Name v ->
+ (String.lowercase n, String.uppercase v) ::
+ parse_atts_lookahead (next_no_space())
+ | Literal v ->
+ (String.lowercase n,v) ::
+ parse_atts_lookahead (next_no_space())
+ | Eof ->
+ raise End_of_scan
+ | Relement ->
+ (* Illegal *)
+ []
+ | _ ->
+ (* Illegal *)
+ parse_atts_lookahead (next_no_space())
+ end
+ | Eof ->
+ raise End_of_scan
+ | Relement ->
+ (* <tag name> <==> <tag name="name"> *)
+ [ String.lowercase n, String.lowercase n ]
+ | next' ->
+ (* assume <tag name ... > <==> <tag name="name" ...> *)
+ ( String.lowercase n, String.lowercase n ) ::
+ parse_atts_lookahead next'
+ end
+ | Eof ->
+ raise End_of_scan
+ | _ ->
+ (* Illegal *)
+ parse_atts_lookahead (next_no_space())
+ in
+ parse_atts_lookahead (next_no_space())
+ in
+
+ let rec parse_special name =
+ (* Parse until </name> *)
+ match scan_special buf with
+ Lelementend n ->
+ if n = name then
+ ""
+ else
+ "</" ^ n ^ parse_special name
+ | Eof ->
+ raise End_of_scan
+ | Cdata s ->
+ s ^ parse_special name
+ | _ ->
+ (* Illegal *)
+ parse_special name
+ in
+
+ let rec skip_element() =
+ (* Skip until ">" *)
+ match scan_element buf with
+ Relement ->
+ ()
+ | Eof ->
+ raise End_of_scan
+ | _ ->
+ skip_element()
+ in
+
+ let rec parse_next() =
+ let t = scan_document buf in
+ match t with
+ Lcomment ->
+ parse_comment buf;
+ parse_next()
+ | Ldoctype ->
+ parse_doctype buf;
+ parse_next()
+ | Lelement name ->
+ let name = String.lowercase name in
+ if List.mem name !no_end_tag then begin
+ let atts = parse_atts() in
+ current_subs := (Element(name, atts, [])) :: !current_subs;
+ parse_next()
+ end
+ else if List.mem name !special_tag then begin
+ let atts = parse_atts() in
+ let data = parse_special name in
+ (* Read until ">" *)
+ skip_element();
+ current_subs := (Element(name, atts, [Data data])) :: !current_subs;
+ parse_next()
+ end
+ else begin
+ let atts = parse_atts() in
+ Stack.push (!current_name, !current_atts, !current_subs) stack;
+ current_name := name;
+ current_atts := atts;
+ current_subs := [];
+ parse_next()
+ end
+ | Cdata data ->
+ current_subs := (Data data) :: !current_subs;
+ parse_next()
+ | Lelementend name ->
+ let name = String.lowercase name in
+ (* Read until ">" *)
+ skip_element();
+ (* Search the element to close on the stack: *)
+ let found = ref (name = !current_name) in
+ Stack.iter
+ (fun (old_name, _, _) ->
+ if name = old_name then found := true)
+ stack;
+ (* If not found, the end tag is wrong. Simply ignore it. *)
+ if not !found then
+ parse_next()
+ else begin
+ (* Put the current element on to the stack: *)
+ Stack.push (!current_name, !current_atts, !current_subs) stack;
+ (* If found: Remove the elements from the stack, and append
+ * them to the previous element as sub elements
+ *)
+ let rec remove() =
+ let old_name, old_atts, old_subs = Stack.pop stack in
+ (* or raise Stack.Empty *)
+ if old_name = name then
+ old_name, old_atts, old_subs
+ else
+ let older_name, older_atts, older_subs = remove() in
+ older_name,
+ older_atts,
+ (Element (old_name, old_atts, List.rev old_subs) :: older_subs)
+ in
+ let old_name, old_atts, old_subs = remove() in
+ (* Remove one more element: the element containing the element
+ * currently being closed.
+ *)
+ let new_name, new_atts, new_subs = Stack.pop stack in
+ current_name := new_name;
+ current_atts := new_atts;
+ current_subs := (Element (old_name, old_atts, List.rev old_subs))
+ :: new_subs;
+ (* Go on *)
+ parse_next()
+ end
+ | Eof ->
+ raise End_of_scan
+ | _ ->
+ parse_next()
+ in
+ try
+ parse_next();
+ List.rev !current_subs
+ with
+ End_of_scan ->
+ (* Close all remaining elements: *)
+ Stack.push (!current_name, !current_atts, !current_subs) stack;
+ let rec remove() =
+ let old_name, old_atts, old_subs = Stack.pop stack in
+ (* or raise Stack.Empty *)
+ try
+ let older_name, older_atts, older_subs = remove() in
+ older_name,
+ older_atts,
+ (Element (old_name, old_atts, List.rev old_subs) :: older_subs)
+ with
+ Stack.Empty ->
+ old_name, old_atts, old_subs
+ in
+ let name, atts, subs = remove() in
+ List.rev subs
+;;
+
+
+let parse_string s =
+ let buf = Lexing.from_string s in
+ parse_document buf
+;;
+
+
+let parse_file fd =
+ let buf = Lexing.from_channel fd in
+ parse_document buf
+;;
+
+(* ======================================================================
+ * History:
+ *
+ * $Log$
+ * Revision 1.1 2000/11/17 09:57:28 lpadovan
+ * Initial revision
+ *
+ * Revision 1.1 2000/03/03 01:07:25 gerd
+ * Initial revision.
+ *
+ *
+ *)